From c8eaf7a2f8e92e74ff1899c7ae9861d34172e348 Mon Sep 17 00:00:00 2001 From: Siyuan Zhang Date: Tue, 20 Dec 2022 09:49:22 +0800 Subject: [PATCH] Launch engines with each component inside its own container. (#2247) Changes on the interface: 1. Image identifiers are controlled by registry and tag; the image name is fixed by convention. 2. Users can now control a subset of components to be launched by `with_analytical`, `with_analytical_java`, `with_interactive` and `with_learning`. Additionally, users can use `with_dataset` to launch a dataset container. ```python sess = graphscope.session( k8s_image_registry="registry.cn-hongkong.aliyuncs.com", # the default value. Could be "" to use local images k8s_image_tag="0.19.0", # default __version__ num_workers=2, with_analytical=True, # default True with_analytical_java=False, # default False with_interactive=True, # default True with_learning=True, # default True, k8s_vineyard_image="ghcr.io/v6d-io/v6d/vineyardd:v0.11.1", # the default value. ) # the session will have a coordinator pod and 2 engine pods, each of which has 4 containers. The coordinator is a deployment with 1 replica; the engine is a statefulset with 2 replicas. etcd is started by the vineyard container, which is a small and handy image. ``` Major changes are: 1. Refactored CI pipelines 1. Dedicate a k8s CI workflow by building images 2. Dedicate a local CI workflow by installing the built graphscope wheel. 3. Amend dummy CI workflows 4. Do not upload the GIE log when succeeded. 2. Increased coordinator launch speed by reducing the initial delay seconds; the delay is not necessary as the coordinator starts very fast. 3. Rename `mount_dataset` to `with_dataset` and mount to a fixed path to align with other `with_X` jargons. 4. Updated GraphScope Helm charts accordingly 5. GraphScope cluster uses statefulsets instead of deployments 6. Use standard vineyard image 7. Get rid of etcd pods and rely on the vineyard image to start etcd 8. Frontend now has its own pod. 9. 
Vineyardd is launched in the container args. 10. Disabled Mars 11. Use classes defined in the Kubernetes client instead of self-maintained JSON to build resources. 12. Move the compiling process to the engine pod 13. Refactored issue-command snippets; give error logs when failing. 14. Optimize many inefficient logging statements, and use f-string over str.format 15. Add `.git` to .dockerignore and commented out the `*.pb.*` pattern. 16. Handle exceptions when deleting the dangling coordinator 17. Remove cmake from requirements.txt of coordinator, as installed cmake is broken. 18. Upload a dummy dev image `graphscope-dev:ci` for the CI process 19. Install rapidjson and msgpack to the analytical container 20. Install Hadoop and get rid of its huge docs directory to vineyard-dev. 21. Enable launch without analytical engine. --- .dockerignore | 7 +- .../build-graphscope-images-linux.yml | 144 ++ .../build-graphscope-wheels-linux.yml | 18 +- .github/workflows/gaia.yml | 2 +- .github/workflows/k8s-ci-dummy.yml | 115 ++ .github/workflows/k8s-ci.yml | 630 +++++++ .../{ci-dummy.yml => local-ci-dummy.yml} | 16 +- .github/workflows/{ci.yml => local-ci.yml} | 307 +--- Makefile | 2 - README-zh.md | 6 +- README.md | 6 +- .../core/loader/dynamic_to_arrow_converter.h | 1 + charts/graphscope/templates/_helpers.tpl | 24 + charts/graphscope/templates/coordinator.yaml | 60 +- .../templates/role_and_binding.yaml | 2 +- charts/graphscope/values.yaml | 102 +- coordinator/gscoordinator/cluster_builder.py | 629 +++++++ coordinator/gscoordinator/coordinator.py | 193 ++- .../gscoordinator/kubernetes_launcher.py | 1253 +++++--------- coordinator/gscoordinator/launcher.py | 7 + coordinator/gscoordinator/local_launcher.py | 29 +- coordinator/gscoordinator/op_executor.py | 36 +- .../template/CMakeLists.template | 2 +- coordinator/gscoordinator/utils.py | 335 ++-- coordinator/requirements.txt | 1 - coordinator/setup.py | 2 +- docs/zh/loading_graph.rst | 4 +- .../assembly/src/bin/graphscope/giectl | 58 +- 
interactive_engine/tests/function_test.sh | 6 +- k8s/Makefile | 40 +- .../manylinux/Dockerfile | 4 +- .../manylinux/Makefile | 3 +- k8s/actions-runner-controller/ubuntu.yaml | 2 +- k8s/build_scripts/build_vineyard.sh | 5 +- k8s/dockerfiles/analytical.Dockerfile | 85 +- k8s/dockerfiles/coordinator.Dockerfile | 61 +- .../graphscope-dev-base.Dockerfile | 3 +- k8s/dockerfiles/graphscope-dev.Dockerfile | 5 +- k8s/dockerfiles/graphscope-store.Dockerfile | 13 +- .../interactive-experimental.Dockerfile | 7 +- k8s/dockerfiles/interactive.Dockerfile | 49 +- k8s/dockerfiles/learning.Dockerfile | 51 +- k8s/dockerfiles/vineyard-dev.Dockerfile | 40 +- k8s/dockerfiles/vineyard-runtime.Dockerfile | 18 +- k8s/internal/Makefile | 4 +- k8s/internal/jupyter.Dockerfile | 2 +- k8s/utils/precompile.py | 48 +- python/graphscope/client/rpc.py | 4 +- python/graphscope/client/session.py | 95 +- python/graphscope/config.py | 50 +- .../graphscope/deploy/kubernetes/cluster.py | 565 +++--- .../deploy/kubernetes/resource_builder.py | 1530 ++++------------- python/graphscope/deploy/kubernetes/utils.py | 54 +- python/graphscope/deploy/launcher.py | 2 +- python/graphscope/framework/utils.py | 2 +- python/graphscope/nx/conftest.py | 1 + python/graphscope/tests/conftest.py | 1 + .../tests/kubernetes/test_demo_script.py | 35 +- .../tests/kubernetes/test_resource_builder.py | 42 - .../tests/kubernetes/test_with_mars.py | 18 +- python/graphscope/tests/unittest/test_lazy.py | 1 + .../tests/unittest/test_scalability.py | 1 + .../graphscope/tests/unittest/test_session.py | 1 + ...ification_on_citation_network_on_k8s.ipynb | 13 +- ...ification_on_citation_network_on_k8s.ipynb | 6 +- 65 files changed, 3426 insertions(+), 3432 deletions(-) create mode 100644 .github/workflows/build-graphscope-images-linux.yml create mode 100644 .github/workflows/k8s-ci-dummy.yml create mode 100644 .github/workflows/k8s-ci.yml rename .github/workflows/{ci-dummy.yml => local-ci-dummy.yml} (86%) rename .github/workflows/{ci.yml 
=> local-ci.yml} (61%) create mode 100644 coordinator/gscoordinator/cluster_builder.py delete mode 100644 python/graphscope/tests/kubernetes/test_resource_builder.py diff --git a/.dockerignore b/.dockerignore index 98d8cd3b27fc..58101b45d6a9 100644 --- a/.dockerignore +++ b/.dockerignore @@ -14,11 +14,16 @@ python/proto # Gar file **/*.gar +# Git +# .git +.cache + # dot file *.dot # Protobuf GRPC -**/*.pb.* +# Needed by build analytical image in CI, the COPY should copy the generated proto files +# **/*.pb.* **/*_pb2.py **/*_pb2_grpc.py diff --git a/.github/workflows/build-graphscope-images-linux.yml b/.github/workflows/build-graphscope-images-linux.yml new file mode 100644 index 000000000000..80974c74b3b4 --- /dev/null +++ b/.github/workflows/build-graphscope-images-linux.yml @@ -0,0 +1,144 @@ +name: Build GraphScope Images on Linux + +# on: [push, pull_request] +on: + workflow_dispatch: + schedule: + # The notifications for scheduled workflows are sent to the user who + # last modified the cron syntax in the workflow file. + # Trigger the workflow at 03:00(CST) every day. 
+ - cron: '00 19 * * *' + push: + tags: + - "v*" + +env: + REGISTRY: registry.cn-hongkong.aliyuncs.com + +jobs: + build-image: + if: (github.ref == 'refs/heads/main' && github.repository == 'alibaba/GraphScope') || (github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && github.repository == 'alibaba/GraphScope') + runs-on: ubuntu-20.04 + + steps: + - uses: actions/checkout@v3 + with: + submodules: true + + - name: Add envs to GITHUB_ENV + run: | + short_sha=$(git rev-parse --short HEAD) + echo "SHORT_SHA=${short_sha}" >> $GITHUB_ENV + + - name: Build GraphScope Image + run: | + cd ${GITHUB_WORKSPACE}/k8s + make coordinator CI=false + make analytical CI=false + make analytical-java CI=false + make interactive-frontend CI=false + make interactive-executor CI=false + make learning CI=false + + # make jupyter-image + # cd ${GITHUB_WORKSPACE} + # docker build --build-arg CI=${CI} -t graphscope/jupyter:${SHORT_SHA} -f ./k8s/internal/jupyter.Dockerfile . + + # dataset image doesn't changed, we can just use the latest one + # make dataset-image + + - name: Release Nightly Image + if: ${{ github.ref == 'refs/heads/main' && github.repository == 'alibaba/GraphScope' }} + env: + docker_password: ${{ secrets.DOCKER_PASSWORD }} + docker_username: ${{ secrets.DOCKER_USER }} + run: | + echo "${docker_password}" | sudo docker login --username="${docker_username}" ${{ env.REGISTRY }} --password-stdin + # docker tag: 0.15.0 -> 0.15.0a20220808 + time=$(date "+%Y%m%d") + version=$(cat ${GITHUB_WORKSPACE}/VERSION) + tag="${version}a${time}" + + # graphscope image + sudo docker tag graphscope/coordinator:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/coordinator:${tag} + sudo docker tag graphscope/analytical:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/analytical:${tag} + sudo docker tag graphscope/analytical-java:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/analytical-java:${tag} + sudo docker tag graphscope/interactive-frontend:${SHORT_SHA} ${{ env.REGISTRY 
}}/graphscope/interactive-frontend:${tag} + sudo docker tag graphscope/interactive-executor:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/interactive-executor:${tag} + sudo docker tag graphscope/learning:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/learning:${tag} + + sudo docker push ${{ env.REGISTRY }}/graphscope/coordinator:${tag} + sudo docker push ${{ env.REGISTRY }}/graphscope/analytical:${tag} + # sudo docker push ${{ env.REGISTRY }}/graphscope/analytical-java:${tag} + sudo docker push ${{ env.REGISTRY }}/graphscope/interactive-frontend:${tag} + sudo docker push ${{ env.REGISTRY }}/graphscope/interactive-executor:${tag} + sudo docker push ${{ env.REGISTRY }}/graphscope/learning:${tag} + + # jupyter image + # sudo docker tag graphscope/jupyter:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/jupyter:${tag} + # sudo docker push ${{ env.REGISTRY }}/graphscope/jupyter:${tag} + # dataset image + # sudo docker tag graphscope/dataset:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/dataset:${tag} + # sudo docker push ${{ env.REGISTRY }}/graphscope/dataset:${tag} + + - name: Extract Tag Name + if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && github.repository == 'alibaba/GraphScope' }} + id: tag + run: echo "TAG=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT + + - name: Release Image + if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && github.repository == 'alibaba/GraphScope' }} + env: + docker_password: ${{ secrets.DOCKER_PASSWORD }} + docker_username: ${{ secrets.DOCKER_USER }} + run: | + echo "${docker_password}" | sudo docker login --username="${docker_username}" ${{ env.REGISTRY }} --password-stdin + + # Release version tag + tag=${{ steps.tag.outputs.TAG }} + # graphscope image + sudo docker tag graphscope/coordinator:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/coordinator:${tag} + sudo docker tag graphscope/analytical:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/analytical:${tag} + sudo docker tag 
graphscope/analytical-java:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/analytical-java:${tag} + sudo docker tag graphscope/interactive-frontend:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/interactive-frontend:${tag} + sudo docker tag graphscope/interactive-executor:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/interactive-executor:${tag} + sudo docker tag graphscope/learning:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/learning:${tag} + + sudo docker push ${{ env.REGISTRY }}/graphscope/coordinator:${tag} + sudo docker push ${{ env.REGISTRY }}/graphscope/analytical:${tag} + sudo docker push ${{ env.REGISTRY }}/graphscope/analytical-java:${tag} + sudo docker push ${{ env.REGISTRY }}/graphscope/interactive-frontend:${tag} + sudo docker push ${{ env.REGISTRY }}/graphscope/interactive-executor:${tag} + sudo docker push ${{ env.REGISTRY }}/graphscope/learning:${tag} + + # jupyter image + # sudo docker tag graphscope/jupyter:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/jupyter:${tag} + # sudo docker tag graphscope/jupyter:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/jupyter:latest + # sudo docker push ${{ env.REGISTRY }}/graphscope/jupyter:${tag} + + # dataset image + # Note! dataset image are built mannually just use the latest one. 
+ sudo docker pull ${{ env.REGISTRY }}/graphscope/dataset:latest + sudo docker tag ${{ env.REGISTRY }}/graphscope/dataset:latest ${{ env.REGISTRY }}/graphscope/dataset:${tag} + sudo docker push ${{ env.REGISTRY }}/graphscope/dataset:${tag} + + # Release the latest tag + tag=latest + # graphscope image + sudo docker tag graphscope/coordinator:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/coordinator:${tag} + sudo docker tag graphscope/analytical:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/analytical:${tag} + sudo docker tag graphscope/analytical-java:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/analytical-java:${tag} + sudo docker tag graphscope/interactive-frontend:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/interactive-frontend:${tag} + sudo docker tag graphscope/interactive-executor:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/interactive-executor:${tag} + sudo docker tag graphscope/learning:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/learning:${tag} + + sudo docker push ${{ env.REGISTRY }}/graphscope/coordinator:${tag} + sudo docker push ${{ env.REGISTRY }}/graphscope/analytical:${tag} + sudo docker push ${{ env.REGISTRY }}/graphscope/analytical-java:${tag} + sudo docker push ${{ env.REGISTRY }}/graphscope/interactive-frontend:${tag} + sudo docker push ${{ env.REGISTRY }}/graphscope/interactive-executor:${tag} + sudo docker push ${{ env.REGISTRY }}/graphscope/learning:${tag} + + # jupyter image + # sudo docker tag graphscope/jupyter:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/jupyter:${tag} + # sudo docker push ${{ env.REGISTRY }}/graphscope/jupyter:${tag} diff --git a/.github/workflows/build-graphscope-wheels-linux.yml b/.github/workflows/build-graphscope-wheels-linux.yml index ae2c511437fe..93b7423d0226 100644 --- a/.github/workflows/build-graphscope-wheels-linux.yml +++ b/.github/workflows/build-graphscope-wheels-linux.yml @@ -133,8 +133,8 @@ jobs: version=$(cat ${GITHUB_WORKSPACE}/VERSION) tag="${version}a${time}" # graphscope image - sudo docker tag 
graphscope/graphscope:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/graphscope:${tag} - sudo docker push ${{ env.REGISTRY }}/graphscope/graphscope:${tag} + # sudo docker tag graphscope/graphscope:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/graphscope:${tag} + # sudo docker push ${{ env.REGISTRY }}/graphscope/graphscope:${tag} # jupyter image sudo docker tag graphscope/jupyter:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/jupyter:${tag} sudo docker push ${{ env.REGISTRY }}/graphscope/jupyter:${tag} @@ -157,16 +157,20 @@ jobs: # graphscope image tag=${{ steps.tag.outputs.TAG }} # graphscope image - sudo docker tag graphscope/graphscope:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/graphscope:${tag} - sudo docker push ${{ env.REGISTRY }}/graphscope/graphscope:${tag} + # sudo docker tag graphscope/graphscope:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/graphscope:${tag} + # sudo docker push ${{ env.REGISTRY }}/graphscope/graphscope:${tag} # jupyter image sudo docker tag graphscope/jupyter:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/jupyter:${tag} sudo docker push ${{ env.REGISTRY }}/graphscope/jupyter:${tag} # dataset image # Note! dataset image are built mannually just use the latest one. 
- sudo docker pull ${{ env.REGISTRY }}/graphscope/dataset:latest - sudo docker tag ${{ env.REGISTRY }}/graphscope/dataset:latest ${{ env.REGISTRY }}/graphscope/dataset:${tag} - sudo docker push ${{ env.REGISTRY }}/graphscope/dataset:${tag} + # sudo docker pull ${{ env.REGISTRY }}/graphscope/dataset:latest + # sudo docker tag ${{ env.REGISTRY }}/graphscope/dataset:latest ${{ env.REGISTRY }}/graphscope/dataset:${tag} + # sudo docker push ${{ env.REGISTRY }}/graphscope/dataset:${tag} + + tag=latest + sudo docker tag graphscope/jupyter:${SHORT_SHA} ${{ env.REGISTRY }}/graphscope/jupyter:${tag} + sudo docker push ${{ env.REGISTRY }}/graphscope/jupyter:${tag} ubuntu-python-test: if: ${{ github.ref == 'refs/heads/main' && github.repository == 'alibaba/GraphScope' }} diff --git a/.github/workflows/gaia.yml b/.github/workflows/gaia.yml index c23fc42846d9..1f2ba67ad09c 100644 --- a/.github/workflows/gaia.yml +++ b/.github/workflows/gaia.yml @@ -104,7 +104,7 @@ jobs: cd ${GITHUB_WORKSPACE}/interactive_engine/compiler && ./ir_exprimental_pattern_ci.sh - name: Upload GIE log - if: always() + if: failure() uses: actions/upload-artifact@v3 with: name: gie-log diff --git a/.github/workflows/k8s-ci-dummy.yml b/.github/workflows/k8s-ci-dummy.yml new file mode 100644 index 000000000000..879d522dabdd --- /dev/null +++ b/.github/workflows/k8s-ci-dummy.yml @@ -0,0 +1,115 @@ +name: GraphScope CI on Kubernetes (Dummy) + +on: + pull_request: + branches: + - main + paths: + - '**' + - '!.github/workflows/ci.yml' + - '!Makefile' + - '!analytical_engine/**' + - '!charts/**' + - '!coordinator/**' + - '!interactive_engine/**' + - '!k8s/**' + - '!learning_engine/**' + - '!proto/**' + - '!python/**' + - '**.md' + - '**.rst' + +concurrency: + group: ${{ github.repository }}-${{ github.event.number || github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + +env: + GS_IMAGE: registry.cn-hongkong.aliyuncs.com/graphscope/graphscope + +jobs: + # JOB to run change detection 
+ changes: + runs-on: ubuntu-20.04 + # Set job outputs to values from filter step + outputs: + gae-python: ${{ steps.filter.outputs.gae-python }} + networkx: ${{ steps.filter.outputs.networkx }} + gie-function-test: ${{ steps.filter.outputs.gie-function-test }} + steps: + # For push it's necessary to checkout the code + - uses: actions/checkout@v3 + # For pull requests it's not necessary to checkout the code + - uses: dorny/paths-filter@v2 + id: filter + with: + base: main # Change detection against merge-base with main before push + filters: | + gae-python: + - 'proto/**' + - 'analytical_engine/**' + - 'python/graphscope/analytical/**' + - 'python/graphscope/client/**' + - 'python/graphscope/dataset/**' + - 'python/graphscope/deploy/**' + - 'python/graphscope/framework/**' + - 'python/graphscope/tests/unittest/**' + - 'coordinator/gscoordinator/**' + - '.github/workflows/ci.yml' + networkx: + - 'analytical_engine/apps/**' + - 'analytical_engine/frame/**' + - 'analytical_engine/core/**' + - 'python/graphscope/nx/**' + gie-function-test: + - 'interactive_engine/**' + - 'python/graphscope/interactive/**' + - '.github/workflows/ci.yml' + + build-analytical: + runs-on: ubuntu-20.04 + needs: [build-wheels, changes] + if: ${{ github.repository == 'alibaba/GraphScope' }} + steps: + - run: 'echo "No action required" ' + + build-analytical-java: + runs-on: ubuntu-20.04 + needs: [build-wheels, changes] + if: ${{ github.repository == 'alibaba/GraphScope' }} + steps: + - run: 'echo "No action required" ' + + build-interactive: + runs-on: ubuntu-20.04 + needs: [build-wheels, changes] + if: ${{ github.repository == 'alibaba/GraphScope' }} + steps: + - run: 'echo "No action required" ' + + build-learning: + runs-on: ubuntu-20.04 + needs: [build-wheels, changes] + if: ${{ github.repository == 'alibaba/GraphScope' }} + steps: + - run: 'echo "No action required" ' + + # build-coordinator: + # runs-on: ubuntu-20.04 + # needs: [build-wheels, changes] + # if: ${{ github.repository == 
'alibaba/GraphScope' }} + # steps: + # - run: 'echo "No action required" ' + + k8s-test: + runs-on: ubuntu-20.04 + if: ${{ github.repository == 'alibaba/GraphScope' }} + needs: [build-analytical, build-analytical-java, build-interactive, build-learning] + steps: + - run: 'echo "No action required" ' + + gie-test: + runs-on: ubuntu-20.04 + needs: [build-analytical, build-analytical-java, build-interactive, build-learning] + if: ${{ (needs.changes.outputs.gie-function-test == 'false' || github.ref == 'refs/heads/main') && github.repository == 'alibaba/GraphScope' }} + steps: + - run: 'echo "No action required" ' diff --git a/.github/workflows/k8s-ci.yml b/.github/workflows/k8s-ci.yml new file mode 100644 index 000000000000..c991143459d7 --- /dev/null +++ b/.github/workflows/k8s-ci.yml @@ -0,0 +1,630 @@ +name: GraphScope CI on Kubernetes + +on: + # Trigger the workflow on push or pull request, + # but only for the main branch + push: + branches: + - main + paths-ignore: + - 'CONTRIBUTORS' + - 'LICENSE' + - 'NOTICE.txt' + - '**.md' + - '**.rst' + - 'docs/**' + - 'demo/**' + - 'scripts/**' + - 'tutorials/**' + pull_request: + branches: + - main + paths: + - '.github/workflows/ci.yml' + - 'Makefile' + - 'analytical_engine/**' + - 'charts/**' + - 'coordinator/**' + - 'interactive_engine/**' + - 'k8s/**' + - 'learning_engine/**' + - 'proto/**' + - 'python/**' + - '!**.md' + - '!**.rst' + +concurrency: + group: ${{ github.repository }}-${{ github.event.number || github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + +env: + GS_IMAGE: registry.cn-hongkong.aliyuncs.com/graphscope/graphscope + +jobs: + # JOB to run change detection + changes: + runs-on: ubuntu-20.04 + # Set job outputs to values from filter step + outputs: + gae-python: ${{ steps.filter.outputs.gae-python }} + networkx: ${{ steps.filter.outputs.networkx }} + gie-function-test: ${{ steps.filter.outputs.gie-function-test }} + steps: + # For push it's necessary to checkout the code + 
- uses: actions/checkout@v3 + # For pull requests it's not necessary to checkout the code + - uses: dorny/paths-filter@v2 + id: filter + with: + base: main # Change detection against merge-base with main before push + filters: | + gae-python: + - 'proto/**' + - 'analytical_engine/**' + - 'python/graphscope/analytical/**' + - 'python/graphscope/client/**' + - 'python/graphscope/dataset/**' + - 'python/graphscope/deploy/**' + - 'python/graphscope/framework/**' + - 'python/graphscope/tests/unittest/**' + - 'coordinator/gscoordinator/**' + - '.github/workflows/ci.yml' + networkx: + - 'analytical_engine/apps/**' + - 'analytical_engine/frame/**' + - 'analytical_engine/core/**' + - 'python/graphscope/nx/**' + gie-function-test: + - 'interactive_engine/**' + - 'python/graphscope/interactive/**' + - '.github/workflows/ci.yml' + gie-k8s-failover-test: + - 'interactive_engine/**' + - 'charts/ir-standalone/**' + + - name: Cpp Format and Lint Check + run: | + # install clang-format + sudo curl -L https://github.com/muttleyxd/clang-tools-static-binaries/releases/download/master-22538c65/clang-format-8_linux-amd64 --output /usr/bin/clang-format + sudo chmod +x /usr/bin/clang-format + + # collect the source files + cd analytical_engine/ + files=$(find ./apps ./benchmarks ./core ./frame ./misc ./test \( -name "*.h" -o -name "*.cc" \)) + + # run format + clang-format -i --style=file $(echo $files) + + # validate format + function prepend() { while read line; do echo "${1}${line}"; done; } + + GIT_DIFF=$(git diff --ignore-submodules) + if [[ -n $GIT_DIFF ]]; then + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "| clang-format failures found!" + echo "|" + echo "$GIT_DIFF" | prepend "| " + echo "|" + echo "| Run: " + echo "|" + echo "| make gsa_clformat" + echo "|" + echo "| to fix this error." 
+ echo "|" + echo "| Ensure you are working with clang-format-8, which can be obtained from" + echo "|" + echo "| https://github.com/muttleyxd/clang-tools-static-binaries/releases" + echo "|" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + exit -1 + fi + + # validate cpplint + function ec() { [[ "$1" == "-h" ]] && { shift && eval $* > /dev/null 2>&1; ec=$?; echo $ec; } || eval $*; ec=$?; } + + # run cpplint + ec ./misc/cpplint.py $(echo $files) + if [[ "$ec" != "0" ]]; then + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "| cpplint failures found! Run: " + echo "|" + echo "| make gsa_cpplint" + echo "|" + echo "| to fix this error." + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + exit -1 + fi + + - name: Python Format and Lint Check + run: | + echo "Checking formatting for $GITHUB_REPOSITORY" + pip3 install -r coordinator/requirements-dev.txt + pushd python + python3 -m isort --check --diff . + python3 -m black --check --diff . + python3 -m flake8 . + popd + pushd coordinator + python3 -m isort --check --diff . + python3 -m black --check --diff . + python3 -m flake8 . + + - name: Setup Java11 + uses: actions/setup-java@v3 + with: + distribution: 'zulu' + java-version: '11' + + - name: Java Format and Lint Check + run: | + wget https://github.com/google/google-java-format/releases/download/v1.13.0/google-java-format-1.13.0-all-deps.jar + + files_to_format=$(git ls-files *.java) + + # run formatter in-place + java -jar ${GITHUB_WORKSPACE}/google-java-format-1.13.0-all-deps.jar --aosp --skip-javadoc-formatting -i $files_to_format + + # validate format + function prepend() { while read line; do echo "${1}${line}"; done; } + + GIT_DIFF=$(git diff --ignore-submodules) + if [[ -n $GIT_DIFF ]]; then + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "| google-java-format failures found!" 
+ echo "|" + echo "$GIT_DIFF" | prepend "| " + echo "|" + echo "| Run: " + echo "|" + echo '| java -jar google-java-format-1.13.0-all-deps.jar --aosp --skip-javadoc-formatting -i $(git ls-files **/*.java)' + echo "|" + echo "| to fix this error." + echo "|" + echo "| Ensure you are working with google-java-format-1.13.0, which can be obtained from" + echo "|" + echo "| https://github.com/google/google-java-format/releases/download/v1.13.0/google-java-format-1.13.0-all-deps.jar" + echo "|" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + exit -1 + fi + + build-analytical: + runs-on: [self-hosted, manylinux2014] + needs: [changes] + if: ${{ github.repository == 'alibaba/GraphScope' }} + steps: + - uses: actions/checkout@v3 + + - name: Build Artifact + run: | + export GRAPHSCOPE_HOME=${{ github.workspace }}/install + mkdir ${GRAPHSCOPE_HOME} + make analytical-install INSTALL_PREFIX=${GRAPHSCOPE_HOME} + strip ${GRAPHSCOPE_HOME}/bin/grape_engine + strip ${GRAPHSCOPE_HOME}/lib/*.so + python3 ./k8s/utils/precompile.py --graph --output_dir ${GRAPHSCOPE_HOME}/builtin + strip ${GRAPHSCOPE_HOME}/builtin/*/*.so + + - name: Upload Artifacts + uses: actions/upload-artifact@v3 + with: + name: analytical + path: ${{ github.workspace }}/install + retention-days: 5 + + build-analytical-java: + runs-on: [self-hosted, manylinux2014] + needs: [changes] + if: ${{ github.repository == 'alibaba/GraphScope' }} + steps: + - uses: actions/checkout@v3 + - uses: actions/cache@v3 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} + restore-keys: | + ${{ runner.os }}-maven- + + - name: Build Artifact + run: | + export GRAPHSCOPE_HOME=${{ github.workspace }}/install + mkdir ${GRAPHSCOPE_HOME} + make analytical-java-install INSTALL_PREFIX=${GRAPHSCOPE_HOME} + strip ${GRAPHSCOPE_HOME}/bin/grape_engine + strip ${GRAPHSCOPE_HOME}/bin/graphx_runner + strip ${GRAPHSCOPE_HOME}/lib/*.so + python3 ./k8s/utils/precompile.py --graph 
--output_dir ${GRAPHSCOPE_HOME}/builtin + strip ${GRAPHSCOPE_HOME}/builtin/*/*.so + + - name: Upload Artifacts + uses: actions/upload-artifact@v3 + with: + name: analytical-java + path: ${{ github.workspace }}/install + retention-days: 5 + + build-interactive: + runs-on: [self-hosted, manylinux2014] + needs: [changes] + if: ${{ github.repository == 'alibaba/GraphScope' }} + steps: + - uses: actions/checkout@v3 + - uses: actions/cache@v3 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} + restore-keys: | + ${{ runner.os }}-maven- + + - uses: actions/cache@v3 + with: + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + ~/.cache/sccache + key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} + + - name: Build Artifact + run: | + mkdir install + export SCCACHE_DIR=~/.cache/sccache + export RUSTC_WRAPPER=/usr/local/bin/sccache + sccache --start-server + make interactive-install BUILD_TYPE="debug" INSTALL_PREFIX=${{ github.workspace }}/install + strip install/bin/gaia_executor + sccache --show-stats + + - name: Upload Artifacts + uses: actions/upload-artifact@v3 + with: + name: interactive + path: ${{ github.workspace }}/install + retention-days: 5 + + build-learning: + runs-on: [self-hosted, manylinux2014] + needs: [changes] + if: ${{ github.repository == 'alibaba/GraphScope' }} + steps: + - uses: actions/checkout@v3 + with: + submodules: true + + - uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} + restore-keys: | + ${{ runner.os }}-pip- + + - name: Build Artifact + run: | + mkdir install + make learning-install INSTALL_PREFIX=${{ github.workspace }}/install + python3 -m pip install "numpy==1.18.5" "pandas<1.5.0" "grpcio<=1.43.0,>=1.40.0" "grpcio-tools<=1.43.0,>=1.40.0" wheel + cd ${{ github.workspace }}/python + python3 setup.py bdist_wheel + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${{ 
github.workspace }}/learning_engine/graph-learn/graphlearn/built/lib + auditwheel repair --plat=manylinux2014_x86_64 dist/*.whl + cp wheelhouse/*.whl ${{ github.workspace }}/install/ + cd ${{ github.workspace }}/coordinator + python3 setup.py bdist_wheel + cp dist/*.whl ${{ github.workspace }}/install/ + + - name: Upload Artifacts + uses: actions/upload-artifact@v3 + with: + name: learning + path: ${{ github.workspace }}/install + retention-days: 5 + + # build-coordinator: + # runs-on: ubuntu-latest + # needs: [changes] + # if: false # Wheels of learning includes coordinator and client + # steps: + # - uses: actions/checkout@v3g + + k8s-test: + runs-on: [self-hosted, ubuntu2004] + if: ${{ github.repository == 'alibaba/GraphScope' }} + needs: [build-analytical, build-analytical-java, build-interactive, build-learning] + steps: + - uses: actions/checkout@v3 + + - uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} + restore-keys: | + ${{ runner.os }}-pip- + + - uses: actions/download-artifact@v3 + with: + path: artifacts + + - name: Add envs to GITHUB_ENV + run: | + short_sha=$(git rev-parse --short HEAD) + echo "SHORT_SHA=${short_sha}" >> $GITHUB_ENV + + - name: Build Images + run: | + cd ${GITHUB_WORKSPACE}/k8s + # Use a dummy builder image (tag=ci, which is actually a busybox) to reduce time and space to pull the builder + make graphscope CI=true VERSION=${SHORT_SHA} REGISTRY=registry-vpc.cn-hongkong.aliyuncs.com BUILDER_VERSION=ci + + - name: Install Python dependencies + run: | + cd ${GITHUB_WORKSPACE}/python + pip3 install -r requirements.txt + pip3 install pytest pytest-cov pytest-timeout + + # build python client proto + cd ${GITHUB_WORKSPACE}/python + python3 setup.py build_proto + + # install mars + # python3 -m pip install pymars==0.8.0 + + - name: Setup SSH + run: | + ssh-keygen -t rsa -f ~/.ssh/id_rsa -N '' + cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys + chmod og-wx 
~/.ssh/authorized_keys + echo "StrictHostKeyChecking no" >> ~/.ssh/config + sudo /etc/init.d/ssh start + + - name: Kubernetes Test + env: + GS_TEST_DIR: ${{ github.workspace }}/gstest + run: | + # download dataset + git clone -b master --single-branch --depth=1 https://github.com/7br/gstest.git ${GS_TEST_DIR} + + minikube start --base-image='registry-vpc.cn-hongkong.aliyuncs.com/graphscope/kicbase:v0.0.30' \ + --cpus='12' --memory='32000mb' --disk-size='40000mb' \ + --mount=true --mount-string="${GS_TEST_DIR}:${GS_TEST_DIR}" + + export GS_REGISTRY="" + export GS_TAG=${SHORT_SHA} + minikube image load graphscope/coordinator:${SHORT_SHA} + echo "Loaded coordinator" + minikube image load graphscope/analytical:${SHORT_SHA} + echo "Loaded analytical" + minikube image load graphscope/interactive-frontend:${SHORT_SHA} + echo "Loaded frontend" + minikube image load graphscope/interactive-executor:${SHORT_SHA} + echo "Loaded executor" + minikube image load graphscope/learning:${SHORT_SHA} + echo "loaded learning" + cd ${GITHUB_WORKSPACE}/python + python3 -m pytest --ignore=./graphscope/tests/kubernetes/test_store_service.py \ + --cov=graphscope --cov-config=.coveragerc --cov-report=xml \ + --cov-report=term --exitfirst -s -vvv --log-cli-level=INFO \ + ./graphscope/tests/kubernetes + + - name: Setup tmate session + uses: mxschmitt/action-tmate@v3 + if: false + + - name: Upload Coverage + uses: codecov/codecov-action@v3 + with: + file: ./python/coverage.xml + fail_ci_if_error: false + + - name: Helm Test + run: | + cd charts + helm install graphscope --set image.registry="",image.tag=${SHORT_SHA} \ + ./graphscope + helm test graphscope --timeout 5m0s + export NODE_IP=$(kubectl get pod -lgraphscope.coordinator.name=coordinator-graphscope -ojsonpath="{.items[0].status.hostIP}") + export NODE_PORT=$(kubectl get services coordinator-service-graphscope -ojsonpath="{.spec.ports[0].nodePort}") + echo "GraphScope service listen on ${NODE_IP}:${NODE_PORT}" + export 
GS_ADDR=${NODE_IP}:${NODE_PORT} + cd ${GITHUB_WORKSPACE}/python + python3 -m pytest -s -vvv ./graphscope/tests/kubernetes/test_demo_script.py -k test_helm_installation + + - name: HDFS test + env: + JAVA_HOME: /usr/lib/jvm/default-java + GS_TEST_DIR: ${{ github.workspace }}/gstest + run: | + export GS_REGISTRY="" + export GS_TAG=${SHORT_SHA} + # install hadoop HDFS + tar -zxf /home/runner/hadoop-2.10.1.tar.gz -C /tmp/ + cd ${GITHUB_WORKSPACE}/.github/workflows/hadoop_scripts + ./prepare_hadoop.sh /tmp/hadoop-2.10.1 + export PATH=${PATH}:/tmp/hadoop-2.10.1/bin + + # upload data to HDFS + hadoop fs -mkdir /ldbc_sample || true + hadoop fs -chmod 777 /ldbc_sample + hadoop fs -put ${GS_TEST_DIR}/ldbc_sample/person_0_0.csv /ldbc_sample/person_0_0.csv + hadoop fs -put ${GS_TEST_DIR}/ldbc_sample/person_knows_person_0_0.csv /ldbc_sample/person_knows_person_0_0.csv + + # validate hadoop + hadoop fs -ls /ldbc_sample + + # prepare CI environments + export HDFS_TEST_DIR=hdfs:///ldbc_sample + export HDFS_HOST=$(hostname -I | awk '{print $1}') + + # run test + cd ${GITHUB_WORKSPACE}/python + python3 -m pytest -s -vvv ./graphscope/tests/kubernetes/test_demo_script.py -k test_demo_on_hdfs + # Check the result file have successfully written to the given location + # hdfs dfs -test -e /ldbc_sample/res.csv_0 && hdfs dfs -test -e /ldbc_sample/res.csv_1 + + gie-test: + runs-on: [self-hosted, ubuntu2004] + needs: [build-analytical, build-analytical-java, build-interactive, build-learning] + if: ${{ needs.changes.outputs.gie-function-test == 'true' && github.repository == 'alibaba/GraphScope' }} + steps: + - uses: actions/checkout@v3 + + - uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} + restore-keys: | + ${{ runner.os }}-pip- + + - uses: actions/download-artifact@v3 + with: + path: artifacts + + - name: Add envs to GITHUB_ENV + run: | + short_sha=$(git rev-parse --short HEAD) + echo "SHORT_SHA=${short_sha}" >> 
$GITHUB_ENV + + - name: Build Images + run: | + cd ${GITHUB_WORKSPACE}/k8s + # Use a dummy builder image (tag=ci, which is actually a busybox) to reduce time and space to pull the builder + make graphscope CI=true VERSION=${SHORT_SHA} REGISTRY=registry-vpc.cn-hongkong.aliyuncs.com BUILDER_VERSION=ci + + - name: Install Python dependencies + run: | + cd ${GITHUB_WORKSPACE}/python + pip3 install -r requirements.txt + pip3 install pytest pytest-cov pytest-timeout + + # build python client proto + python3 setup.py build_proto + + - name: Run Function Test + run: | + export GS_TEST_DIR=${GITHUB_WORKSPACE}/interactive_engine/tests/src/main/resources + minikube start --base-image='registry-vpc.cn-hongkong.aliyuncs.com/graphscope/kicbase:v0.0.30' \ + --cpus='12' --memory='32000mb' --disk-size='40000mb' \ + --mount=true --mount-string="${GS_TEST_DIR}:${GS_TEST_DIR}" + + export GS_REGISTRY="" + export GS_TAG=${SHORT_SHA} + minikube image load graphscope/coordinator:${SHORT_SHA} + minikube image load graphscope/analytical:${SHORT_SHA} + minikube image load graphscope/interactive-frontend:${SHORT_SHA} + minikube image load graphscope/interactive-executor:${SHORT_SHA} + minikube image load graphscope/learning:${SHORT_SHA} + + export PYTHONPATH=${GITHUB_WORKSPACE}/python:${PYTHONPATH} + cd ${GITHUB_WORKSPACE}/interactive_engine && mvn clean install --quiet -DskipTests -Drust.compile.skip=true -P graphscope,graphscope-assembly + cd ${GITHUB_WORKSPACE}/interactive_engine/tests + # ./function_test.sh 8111 1 + ./function_test.sh 8112 2 + + build-gie-experimental: + # Require the user id of the self-hosted is 1001, which may need to be + # configured manually when a new self-hosted runner is added. 
+ runs-on: [self-hosted, manylinux2014] + if: ${{ github.repository == 'alibaba/GraphScope' }} + steps: + - uses: actions/checkout@v3 + + - uses: actions/cache@v3 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} + restore-keys: | + ${{ runner.os }}-maven- + + - uses: actions/cache@v3 + with: + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + ~/.cache/sccache + key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} + + - name: Build GIE Experimental Artifacts + run: | + source ~/.bashrc + export RPC_TARGET=start_rpc_server_k8s + cd interactive_engine/compiler && make build rpc.target=${RPC_TARGET} + cd ${GITHUB_WORKSPACE} + strip interactive_engine/executor/ir/target/release/${RPC_TARGET} + strip interactive_engine/executor/ir/target/release/libir_core.so + tar -czf artifacts.tar.gz interactive_engine/compiler/target/libs \ + interactive_engine/compiler/target/compiler-0.0.1-SNAPSHOT.jar \ + interactive_engine/compiler/conf \ + interactive_engine/compiler/set_properties.sh \ + interactive_engine/executor/ir/target/release/libir_core.so \ + interactive_engine/executor/ir/target/release/${RPC_TARGET} + + - name: Upload Artifact + uses: actions/upload-artifact@v3 + with: + name: gie-experimental + path: | + artifacts.tar.gz + retention-days: 5 + + gie-k8s-failover-test: + needs: [build-gie-experimental] + runs-on: [self-hosted, ubuntu2004] + if: ${{ github.repository == 'alibaba/GraphScope' }} + steps: + - uses: actions/checkout@v3 + - uses: actions/download-artifact@v3 + with: + name: gie-experimental + path: artifacts + + - name: Display structure of downloaded files + run: ls -R + working-directory: artifacts + + - name: Add envs to GITHUB_ENV + run: | + short_sha=$(git rev-parse --short HEAD) + echo "SHORT_SHA=${short_sha}" >> $GITHUB_ENV + + - name: Prepare Docker Image + run: | + docker build \ + -t 
registry.cn-hongkong.aliyuncs.com/graphscope/interactive-experimental:${SHORT_SHA} \ + -f .github/workflows/docker/interactive-experimental-local-artifacts.Dockerfile . + + - name: Prepare Cluster and Data + env: + GS_TEST_DIR: ${{ github.workspace }}/gstest + STORE_DATA_PATH: /tmp/data + GIE_IMAGE: registry.cn-hongkong.aliyuncs.com/graphscope/interactive-experimental + run: | + # prepare graph data + git clone -b master --single-branch --depth=1 https://github.com/7br/gstest.git ${GS_TEST_DIR} + mkdir -p ${STORE_DATA_PATH} + cp -r ${GS_TEST_DIR}/modern_graph_exp_bin/* ${STORE_DATA_PATH} + + # prepare minikube cluster + minikube start --base-image='registry-vpc.cn-hongkong.aliyuncs.com/graphscope/kicbase:v0.0.30' \ + --cpus='12' --memory='32000mb' --disk-size='40000mb' \ + --mount=true --mount-string="${STORE_DATA_PATH}:${STORE_DATA_PATH}" + minikube image load ${GIE_IMAGE}:${SHORT_SHA} + + # install python gremlin client + pip install gremlinpython + + - name: Run K8S Failover Test + run: | + cd ${GITHUB_WORKSPACE}/charts + # create local persistent volume which contains graph data for test + kubectl apply -f ./ir-standalone/tools/pvc.yaml + # create gie instance (compiler & executor & exp storage) + helm install test ./ir-standalone \ + --set image.repository=graphscope/interactive-experimental \ + --set image.tag=${SHORT_SHA} --set storageType=Experimental \ + --set schemaConfig=expr_modern_schema.json \ + --set store.replicaCount=2 \ + --set frontend.service.type=NodePort + # run failover test + cd ${GITHUB_WORKSPACE}/interactive_engine/compiler && ./ir_k8s_failover_ci.sh default test-graphscope-store 2 1 diff --git a/.github/workflows/ci-dummy.yml b/.github/workflows/local-ci-dummy.yml similarity index 86% rename from .github/workflows/ci-dummy.yml rename to .github/workflows/local-ci-dummy.yml index a0c69e5f7705..60f1ef4fef87 100644 --- a/.github/workflows/ci-dummy.yml +++ b/.github/workflows/local-ci-dummy.yml @@ -1,4 +1,4 @@ -name: GraphScope CI (Dummy) 
+name: GraphScope CI on Local (Dummy) on: pull_request: @@ -104,17 +104,3 @@ jobs: deployment: ["standalone", "distributed"] steps: - run: 'echo "No action required" ' - - gie-test: - runs-on: ubuntu-20.04 - needs: [build-wheels, changes] - if: ${{ (needs.changes.outputs.gie-function-test == 'false' || github.ref == 'refs/heads/main') && github.repository == 'alibaba/GraphScope' }} - steps: - - run: 'echo "No action required" ' - - k8s-test: - runs-on: ubuntu-20.04 - if: ${{ github.repository == 'alibaba/GraphScope' }} - needs: [build-wheels] - steps: - - run: 'echo "No action required" ' diff --git a/.github/workflows/ci.yml b/.github/workflows/local-ci.yml similarity index 61% rename from .github/workflows/ci.yml rename to .github/workflows/local-ci.yml index e8867e58307f..e6808e59db05 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/local-ci.yml @@ -1,4 +1,4 @@ -name: GraphScope CI +name: GraphScope CI on Local on: # Trigger the workflow on push or pull request, @@ -318,7 +318,7 @@ jobs: if: false - name: Upload GIE log - if: always() + if: failure() uses: actions/upload-artifact@v3 with: name: gie-log @@ -566,306 +566,3 @@ jobs: run: | python3 -m pytest --exitfirst -s -v -m "not slow" \ $(dirname $(python3 -c "import graphscope; print(graphscope.__file__)"))/nx/readwrite/tests - - gie-test: - runs-on: [self-hosted, ubuntu2004] - needs: [build-wheels, changes] - if: ${{ needs.changes.outputs.gie-function-test == 'true' && github.repository == 'alibaba/GraphScope' }} - steps: - - uses: actions/checkout@v3 - - - uses: actions/cache@v3 - with: - path: ~/.m2/repository - key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} - restore-keys: | - ${{ runner.os }}-maven- - - - uses: actions/cache@v3 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} - restore-keys: | - ${{ runner.os }}-pip- - - - uses: actions/download-artifact@v3 - with: - path: artifacts - - - name: Add envs to GITHUB_ENV - run: | - 
short_sha=$(git rev-parse --short HEAD) - echo "SHORT_SHA=${short_sha}" >> $GITHUB_ENV - - - name: Prepare Environment - run: | - cd ${GITHUB_WORKSPACE}/artifacts - tar -zxf ./wheel-${{ github.sha }}/client.tar.gz - tar -zxf ./wheel-${{ github.sha }}/graphscope.tar.gz - - # build graphscope image - cd ${GITHUB_WORKSPACE}/k8s/internal - make graphscope - - cd ${GITHUB_WORKSPACE}/python - pip3 install -r requirements.txt - pip3 install -r requirements-dev.txt - - # build python client proto - python3 setup.py build_proto - - - - name: Run Function Test - run: | - export GS_TEST_DIR=${GITHUB_WORKSPACE}/interactive_engine/tests/src/main/resources - minikube start --base-image='registry-vpc.cn-hongkong.aliyuncs.com/graphscope/kicbase:v0.0.30' \ - --cpus='12' --memory='32000mb' --disk-size='40000mb' \ - --mount=true --mount-string="${GS_TEST_DIR}:${GS_TEST_DIR}" - minikube image load graphscope/graphscope:${SHORT_SHA} - - export PYTHONPATH=${GITHUB_WORKSPACE}/python:${PYTHONPATH} - cd ${GITHUB_WORKSPACE}/interactive_engine && mvn clean install --quiet -DskipTests -Drust.compile.skip=true -P graphscope,graphscope-assembly - cd ${GITHUB_WORKSPACE}/interactive_engine/tests - # ./function_test.sh 8111 1 graphscope/graphscope:${SHORT_SHA} - ./function_test.sh 8112 2 graphscope/graphscope:${SHORT_SHA} - - k8s-test: - runs-on: [self-hosted, ubuntu2004] - if: ${{ github.repository == 'alibaba/GraphScope' }} - needs: [build-wheels] - steps: - - uses: actions/checkout@v3 - - - uses: actions/cache@v3 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} - restore-keys: | - ${{ runner.os }}-pip- - - - uses: actions/download-artifact@v3 - with: - path: artifacts - - - name: Add envs to GITHUB_ENV - run: | - short_sha=$(git rev-parse --short HEAD) - echo "SHORT_SHA=${short_sha}" >> $GITHUB_ENV - - - name: Prepare Environment - run: | - cd ${GITHUB_WORKSPACE}/artifacts - tar -zxf ./wheel-${{ github.sha }}/client.tar.gz - tar -zxf ./wheel-${{ 
github.sha }}/graphscope.tar.gz - - cd ${GITHUB_WORKSPACE}/k8s/internal - make graphscope - docker tag graphscope/graphscope:${SHORT_SHA} ${{ env.GS_IMAGE }}:${SHORT_SHA} - - cd ${GITHUB_WORKSPACE}/python - pip3 install -r requirements.txt - pip3 install -r requirements-dev.txt - - # build python client proto - cd ${GITHUB_WORKSPACE}/python - python3 setup.py build_proto - - # install mars - python3 -m pip install pymars==0.8.0 - - # install pytest - python3 -m pip install pytest pytest-cov - - - name: Setup SSH - run: | - ssh-keygen -t rsa -f ~/.ssh/id_rsa -N '' - cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys - chmod og-wx ~/.ssh/authorized_keys - echo "StrictHostKeyChecking no" >> ~/.ssh/config - sudo /etc/init.d/ssh start - - - name: Kubernetes Test - env: - GS_TEST_DIR: ${{ github.workspace }}/gstest - run: | - # download dataset - git clone -b master --single-branch --depth=1 https://github.com/7br/gstest.git ${GS_TEST_DIR} - - # set GS_IMAGE - export GS_IMAGE=${{ env.GS_IMAGE }}:${SHORT_SHA} - - minikube start --base-image='registry-vpc.cn-hongkong.aliyuncs.com/graphscope/kicbase:v0.0.30' \ - --cpus='12' --memory='32000mb' --disk-size='40000mb' \ - --mount=true --mount-string="${GS_TEST_DIR}:${GS_TEST_DIR}" - - minikube image load ${{ env.GS_IMAGE }}:${SHORT_SHA} - cd ${GITHUB_WORKSPACE}/python - python3 -m pytest --ignore=./graphscope/tests/kubernetes/test_store_service.py \ - --cov=graphscope --cov-config=.coveragerc --cov-report=xml \ - --cov-report=term --exitfirst -s -vvv --log-cli-level=INFO \ - ./graphscope/tests/kubernetes - - - name: Setup tmate session - uses: mxschmitt/action-tmate@v3 - if: false - - - name: Upload Coverage - uses: codecov/codecov-action@v3 - with: - file: ./python/coverage.xml - fail_ci_if_error: true - - - name: Helm Test - run: | - cd charts - helm install graphscope --set coordinator.image.tag=${SHORT_SHA} \ - --set engines.image.tag=${SHORT_SHA} \ - --set vineyard.image.tag=${SHORT_SHA} \ - --set imagePullPolicy=IfNotPresent 
\ - ./graphscope - helm test graphscope --timeout 5m0s - export NODE_IP=$(kubectl get pod -lgraphscope.coordinator.name=coordinator-graphscope -ojsonpath="{.items[0].status.hostIP}") - export NODE_PORT=$(kubectl get services coordinator-service-graphscope -ojsonpath="{.spec.ports[0].nodePort}") - echo "GraphScope service listen on ${NODE_IP}:${NODE_PORT}" - export GS_ADDR=${NODE_IP}:${NODE_PORT} - cd ${GITHUB_WORKSPACE}/python - python3 -m pytest -s -vvv ./graphscope/tests/kubernetes/test_demo_script.py -k test_helm_installation - - - name: HDFS test - env: - JAVA_HOME: /usr/lib/jvm/default-java - GS_TEST_DIR: ${{ github.workspace }}/gstest - run: | - export GS_IMAGE=${{ env.GS_IMAGE }}:${SHORT_SHA} - - # install hadoop HDFS - tar -zxf /home/runner/hadoop-2.10.1.tar.gz -C /tmp/ - cd ${GITHUB_WORKSPACE}/.github/workflows/hadoop_scripts - ./prepare_hadoop.sh /tmp/hadoop-2.10.1 - export PATH=${PATH}:/tmp/hadoop-2.10.1/bin - - # upload data to HDFS - hadoop fs -mkdir /ldbc_sample || true - hadoop fs -chmod 777 /ldbc_sample - hadoop fs -put ${GS_TEST_DIR}/ldbc_sample/person_0_0.csv /ldbc_sample/person_0_0.csv - hadoop fs -put ${GS_TEST_DIR}/ldbc_sample/person_knows_person_0_0.csv /ldbc_sample/person_knows_person_0_0.csv - - # validate hadoop - hadoop fs -ls /ldbc_sample - - # prepare CI environments - export HDFS_TEST_DIR=hdfs:///ldbc_sample - export HDFS_HOST=$(hostname -I | awk '{print $1}') - - # run test - cd ${GITHUB_WORKSPACE}/python - python3 -m pytest -s -vvv ./graphscope/tests/kubernetes/test_demo_script.py -k test_demo_on_hdfs - # Check the result file have successfully written to the given location - # hdfs dfs -test -e /ldbc_sample/res.csv_0 && hdfs dfs -test -e /ldbc_sample/res.csv_1 - - build-gie-experimental: - # Require the user id of the self-hosted is 1001, which may need to be - # configured manually when a new self-hosted runner is added. 
- runs-on: [self-hosted, manylinux2014] - if: ${{ github.repository == 'alibaba/GraphScope' }} - steps: - - uses: actions/checkout@v3 - - - uses: actions/cache@v3 - with: - path: ~/.m2/repository - key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} - restore-keys: | - ${{ runner.os }}-maven- - - - uses: actions/cache@v3 - with: - path: | - ~/.cargo/bin/ - ~/.cargo/registry/index/ - ~/.cargo/registry/cache/ - ~/.cargo/git/db/ - ~/.cache/sccache - key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} - - - name: Build GIE Experimental Artifacts - run: | - source ~/.bashrc - export RPC_TARGET=start_rpc_server_k8s - cd interactive_engine/compiler && make build rpc.target=${RPC_TARGET} - cd ${GITHUB_WORKSPACE} - tar -czf artifacts.tar.gz interactive_engine/compiler/target/libs \ - interactive_engine/compiler/target/compiler-0.0.1-SNAPSHOT.jar \ - interactive_engine/compiler/conf \ - interactive_engine/compiler/set_properties.sh \ - interactive_engine/executor/ir/target/release/libir_core.so \ - interactive_engine/executor/ir/target/release/${RPC_TARGET} - - - name: Upload Artifact - uses: actions/upload-artifact@v3 - with: - name: gie-experimental - path: | - artifacts.tar.gz - retention-days: 5 - - gie-k8s-failover-test: - needs: [build-gie-experimental] - runs-on: [self-hosted, ubuntu2004] - if: ${{ github.repository == 'alibaba/GraphScope' }} - steps: - - uses: actions/checkout@v3 - - uses: actions/download-artifact@v3 - with: - name: gie-experimental - path: artifacts - - - name: Display structure of downloaded files - run: ls -R - working-directory: artifacts - - - name: Add envs to GITHUB_ENV - run: | - short_sha=$(git rev-parse --short HEAD) - echo "SHORT_SHA=${short_sha}" >> $GITHUB_ENV - - - name: Prepare Docker Image - run: | - docker build \ - -t registry.cn-hongkong.aliyuncs.com/graphscope/interactive-experimental:${SHORT_SHA} \ - -f .github/workflows/docker/interactive-experimental-local-artifacts.Dockerfile . 
- - - name: Prepare Cluster and Data - env: - GS_TEST_DIR: ${{ github.workspace }}/gstest - STORE_DATA_PATH: /tmp/data - GIE_IMAGE: registry.cn-hongkong.aliyuncs.com/graphscope/interactive-experimental - run: | - # prepare graph data - git clone -b master --single-branch --depth=1 https://github.com/7br/gstest.git ${GS_TEST_DIR} - mkdir -p ${STORE_DATA_PATH} - cp -r ${GS_TEST_DIR}/modern_graph_exp_bin/* ${STORE_DATA_PATH} - - # prepare minikube cluster - minikube start --base-image='registry-vpc.cn-hongkong.aliyuncs.com/graphscope/kicbase:v0.0.30' \ - --cpus='12' --memory='32000mb' --disk-size='40000mb' \ - --mount=true --mount-string="${STORE_DATA_PATH}:${STORE_DATA_PATH}" - minikube image load ${GIE_IMAGE}:${SHORT_SHA} - - # install python gremlin client - pip install gremlinpython - - - name: Run K8S Failover Test - run: | - cd ${GITHUB_WORKSPACE}/charts - # create local persistent volume which contains graph data for test - kubectl apply -f ./ir-standalone/tools/pvc.yaml - # create gie instance (compiler & executor & exp storage) - helm install test ./ir-standalone \ - --set image.repository=graphscope/interactive-experimental \ - --set image.tag=${SHORT_SHA} --set storageType=Experimental \ - --set schemaConfig=expr_modern_schema.json \ - --set store.replicaCount=2 \ - --set frontend.service.type=NodePort - # run failover test - cd ${GITHUB_WORKSPACE}/interactive_engine/compiler && ./ir_k8s_failover_ci.sh default test-graphscope-store 2 1 diff --git a/Makefile b/Makefile index 5fce8d5a38a4..ea7de99224d0 100644 --- a/Makefile +++ b/Makefile @@ -153,7 +153,6 @@ $(LEARNING_DIR)/graphlearn/built/lib/libgraphlearn_shared.$(SUFFIX): -DTESTING=${BUILD_TEST} .. 
&& \ $(MAKE) -j$(NUMPROC) -## wheels .PHONY: prepare-client graphscope-docs prepare-client: @@ -184,4 +183,3 @@ k8stest: pip3 install tensorflow==2.5.2 "pandas<1.5.0" cd $(CLIENT_DIR) && \ python3 -m pytest --cov=graphscope --cov-config=.coveragerc --cov-report=xml --cov-report=term -s -v ./graphscope/tests/kubernetes - diff --git a/README-zh.md b/README-zh.md index 7dadf251ab24..fb112bbe2198 100644 --- a/README-zh.md +++ b/README-zh.md @@ -242,18 +242,18 @@ pip3 install graphscope-client 使用 GraphScope 的第一步,我们需要在 Python 中创建一个会话(session)。 -为了方便起见,我们提供了若干示例数据集,可通过设置参数 `mount_dataset` 来挂载这些数据集到集群上,挂载路径是 Pod 中你指定的路径。如果你想使用自己的数据集,请参考 [这篇文档](docs/zh/deployment.rst) +为了方便起见,我们提供了若干示例数据集,可通过设置参数 `with_dataset` 来挂载这些数据集到集群上,挂载路径是 Pod 中 `/dataset`。如果你想使用自己的数据集,请参考 [这篇文档](docs/zh/deployment.rst) ```python import graphscope -sess = graphscope.session(mount_dataset="/dataset") +sess = graphscope.session(with_dataset=True) ``` 对于 macOS,创建会话需要使用 LoadBalancer 服务类型(默认是 NodePort)。 ```python -sess = graphscope.session(mount_dataset="/dataset", k8s_service_type="LoadBalancer") +sess = graphscope.session(with_dataset=True, k8s_service_type="LoadBalancer") ``` 会话的建立过程中,首选会在背后尝试拉起一个 `coordinator` 作为后端引擎的入口。 diff --git a/README.md b/README.md index 3be0c38efdf3..caf42d800a26 100644 --- a/README.md +++ b/README.md @@ -271,12 +271,12 @@ The figure shows the flow of execution in the cluster mode. When users run code To use GraphScope in a distributed setting, we need to establish a session in a python interpreter. -For convenience, we provide several demo datasets, and an option `mount_dataset` to mount the dataset in the graphscope cluster. The datasets will be mounted to location you assigned in the pods. If you want to use your own data on k8s cluster, please refer to [this](docs/deployment.rst). +For convenience, we provide several demo datasets, and an option `with_dataset` to mount the dataset in the graphscope cluster. The datasets will be mounted to `/dataset` in the pods. 
If you want to use your own data on k8s cluster, please refer to [this](docs/deployment.rst). ```python import graphscope -sess = graphscope.session(mount_dataset="/dataset") +sess = graphscope.session(with_dataset=True) ``` For macOS, the session needs to establish with the LoadBalancer service type (which is NodePort by default). @@ -284,7 +284,7 @@ For macOS, the session needs to establish with the LoadBalancer service type (wh ```python import graphscope -sess = graphscope.session(mount_dataset="/dataset", k8s_service_type="LoadBalancer") +sess = graphscope.session(with_dataset=True, k8s_service_type="LoadBalancer") ``` A session tries to launch a `coordinator`, which is the entry for the back-end engines. The coordinator manages a cluster of resources (k8s pods), and the interactive/analytical/learning engines ran on them. For each pod in the cluster, there is a vineyard instance at service for distributed data in memory. diff --git a/analytical_engine/core/loader/dynamic_to_arrow_converter.h b/analytical_engine/core/loader/dynamic_to_arrow_converter.h index 9e323fa1c61d..22a64dc9605b 100644 --- a/analytical_engine/core/loader/dynamic_to_arrow_converter.h +++ b/analytical_engine/core/loader/dynamic_to_arrow_converter.h @@ -25,6 +25,7 @@ #include #include "vineyard/graph/fragment/arrow_fragment.h" +#include "vineyard/graph/utils/table_shuffler.h" #include "core/error.h" #include "core/fragment/dynamic_fragment.h" diff --git a/charts/graphscope/templates/_helpers.tpl b/charts/graphscope/templates/_helpers.tpl index 014b2e5db214..66b8fdc10acd 100644 --- a/charts/graphscope/templates/_helpers.tpl +++ b/charts/graphscope/templates/_helpers.tpl @@ -39,6 +39,30 @@ Transform the Docker Image Registry Secret Names to string with comma separated. 
{{- end }} +{{/* +Return the proper image name +{{ include "graphscope.images.image" ( dict "imageRoot" .Values.path.to.the.image "DefaultTag" .DefaultTag "Component" .Values.path.to.component) }} +*/}} +{{- define "graphscope.images.image" -}} +{{- $registryName := .imageRoot.registry -}} +{{- $repositoryName := .imageRoot.repository -}} +{{- $tag := .imageRoot.tag | toString -}} +{{- $component := .Component.image.name -}} +{{- if not $tag }} +{{- if .DefaultTag }} +{{- $tag = .DefaultTag -}} +{{- else -}} +{{- $tag = "latest" -}} +{{- end -}} +{{- end -}} +{{- if $registryName }} +{{- printf "%s/%s/%s:%s" $registryName $repositoryName $component $tag -}} +{{- else -}} +{{- printf "%s/%s:%s" $repositoryName $component $tag -}} +{{- end -}} +{{- end -}} + + {{/* Unique Label of GraphScope Coordinator. */}} diff --git a/charts/graphscope/templates/coordinator.yaml b/charts/graphscope/templates/coordinator.yaml index 821ad498e31e..70cba6fca809 100644 --- a/charts/graphscope/templates/coordinator.yaml +++ b/charts/graphscope/templates/coordinator.yaml @@ -16,12 +16,8 @@ spec: {{- include "graphscope.imagePullSecrets" . | indent 6 }} containers: - name: coordinator - {{- if .Values.coordinator.image.tag }} - image: "{{ .Values.coordinator.image.name }}:{{ .Values.coordinator.image.tag }}" - {{- else }} - image: "{{ .Values.coordinator.image.name }}:{{ .Chart.AppVersion }}" - {{- end }} - imagePullPolicy: {{ .Values.imagePullPolicy }} + image: {{ include "graphscope.images.image" (dict "imageRoot" .Values.image "DefaultTag" .Chart.AppVersion "Component" .Values.coordinator) }} + imagePullPolicy: {{ .Values.image.pullPolicy }} resources: {{- toYaml .Values.coordinator.resources | nindent 10 }} env: - name: PYTHONUNBUFFERED @@ -57,8 +53,9 @@ spec: kubectl patch rolebinding/{{ include "graphscope.fullname" . 
}}-role-binding \ -n {{ .Release.Namespace }} \ --type json \ - --patch='[ { "op": "remove", "path": "/metadata/finalizers" } ]' - command: + --patch='[ { "op": "remove", "path": "/metadata/finalizers" } ]' && \ + /opt/rh/rh-python38/root/usr/bin/python3 -m gscoordinator.hook.prestop + args: - python3 - "-m" - gscoordinator @@ -78,37 +75,35 @@ spec: - {{ .Values.coordinator.service.type }} - "--preemptive" - {{ .Values.preemptive | quote }} - - "--k8s_gs_image" - {{- if .Values.engines.image.tag }} - - "{{ .Values.engines.image.name }}:{{ .Values.engines.image.tag }}" - {{- else }} - - "{{ .Values.engines.image.name }}:{{ .Chart.AppVersion }}" + {{- if .Values.image.registry }} + - "--k8s_image_registry" + - {{ .Values.image.registry }} {{- end }} - - "--k8s_etcd_image" - {{- if .Values.etcd.image.tag }} - - "{{ .Values.etcd.image.name }}:{{ .Values.etcd.image.tag }}" + - "--k8s_image_repository" + - {{ .Values.image.repository }} + - "--k8s_image_tag" + {{- if .Values.image.tag }} + - {{ .Values.image.tag }} {{- else }} - - "{{ .Values.etcd.image.name }}:{{ .Chart.AppVersion }}" + - {{ .Chart.AppVersion }} {{- end }} - "--k8s_image_pull_policy" - - {{ .Values.imagePullPolicy | quote }} + - {{ .Values.image.pullPolicy | quote }} + {{- if .Values.image.pullSecrets}} - "--k8s_image_pull_secrets" - {{ include "graphscope.imagePullSecretsStr" . | default (printf "''") | trimAll "\n" | quote }} + {{- end }} - "--k8s_coordinator_name" {{- $fullname := include "graphscope.fullname" . 
}} - {{ printf "%s-%s" "coordinator" $fullname | quote }} - "--k8s_coordinator_service_name" - {{ printf "%s-%s" "coordinator-service" $fullname | quote }} - - "--k8s_etcd_cpu" - - {{ .Values.etcd.resources.requests.cpu | quote }} - - "--k8s_etcd_mem" - - {{ .Values.etcd.resources.requests.memory }} - - "--k8s_vineyard_daemonset" {{- if .Values.vineyard.daemonset }} + - "--k8s_vineyard_daemonset" - {{ .Values.vineyard.daemonset }} - {{- else }} - - "\"\"" {{- end }} + - "--k8s_vineyard_image" + - {{ .Values.vineyard.image.name }}:{{ .Values.vineyard.image.tag }} - "--k8s_vineyard_cpu" - {{ .Values.vineyard.resources.requests.cpu | quote }} - "--k8s_vineyard_mem" @@ -119,11 +114,9 @@ spec: - {{ .Values.engines.resources.requests.cpu | quote }} - "--k8s_engine_mem" - {{ .Values.engines.resources.requests.memory }} - - '--k8s_volumes' {{- if .Values.volumes.enabled }} + - '--k8s_volumes' - {{ mustToJson .Values.volumes.items | quote}} - {{- else }} - - "{}" {{- end }} - "--timeout_seconds" - {{ .Values.coordinator.timeout_seconds | quote }} @@ -133,6 +126,17 @@ spec: - "False" - "--k8s_delete_namespace" - "False" + - "--k8s_with_analytical" + - {{ .Values.engines.analytical.enabled | quote }} + - "--k8s_with_analytical_java" + - {{ .Values.engines.analytical_java.enabled | quote }} + - "--k8s_with_interactive" + - {{ .Values.engines.interactive.enabled | quote }} + - "--k8s_with_learning" + - {{ .Values.engines.learning.enabled | quote }} + - "--k8s_with_dataset" + - {{ .Values.engines.dataset.enabled | quote }} + {{- if .Values.withJupyter }} - name: jupyter {{- if .Values.jupyter.image.tag }} diff --git a/charts/graphscope/templates/role_and_binding.yaml b/charts/graphscope/templates/role_and_binding.yaml index c0a5f4d1596b..da40e23288a8 100644 --- a/charts/graphscope/templates/role_and_binding.yaml +++ b/charts/graphscope/templates/role_and_binding.yaml @@ -7,7 +7,7 @@ metadata: - kubernetes rules: - apiGroups: ["apps", "extensions", ""] - resources: ["configmaps", 
"deployments", "deployments/status", "endpoints", "events", "pods", "pods/log", "pods/exec", "pods/status", "services", "replicasets"] + resources: ["configmaps", "deployments", "deployments/status", "statefulsets", "statefulsets/status", "endpoints", "events", "pods", "pods/log", "pods/exec", "pods/status", "services", "replicasets"] verbs: ["*"] - apiGroups: ["rbac.authorization.k8s.io"] resources: ["roles", "rolebindings"] diff --git a/charts/graphscope/values.yaml b/charts/graphscope/values.yaml index 688d08692777..03bdb0f19333 100644 --- a/charts/graphscope/values.yaml +++ b/charts/graphscope/values.yaml @@ -2,15 +2,25 @@ # This is a YAML-formatted file. # Declare variables to be passed into your templates. +image: + registry: registry.cn-hongkong.aliyuncs.com + repository: graphscope + # Overrides the image tag whose default is the chart appVersion. + tag: "" + ## Specify a imagePullPolicy + ## Defaults to 'Always' if image tag is 'latest', else set to 'IfNotPresent' + ## ref: http://kubernetes.io/docs/user-guide/images/#pre-pulling-images + ## + pullPolicy: IfNotPresent + ## Optionally specify an array of imagePullSecrets (secrets must be manually created in the namespace) + ## ref: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/ + ## Example: + ## pullSecrets: + ## - myRegistryKeySecretName + ## + pullSecrets: [] + -imagePullPolicy: Always -## Optionally specify an array of imagePullSecrets. -## Secrets must be manually created in the namespace. -## ref: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/ -## -# imagePullSecrets: -# - RegistryKeySecretName -# ## Specify an array of volumes by setting enabled true. ## Here is an example to mount hostpath `/testingdata` to `/tmp/testingdata` in pod. 
volumes: @@ -30,23 +40,21 @@ coordinator: service: type: NodePort image: - name: registry.cn-hongkong.aliyuncs.com/graphscope/graphscope - # Overrides the image tag whose default is the chart appVersion. - tag: "" + name: coordinator resources: requests: - cpu: 4.0 - memory: 4Gi + cpu: 0.2 + memory: 256Mi limits: - cpu: 4.0 - memory: 4Gi + cpu: 1 + memory: 512Mi extraEnv: {} readinessProbe: enabled: true - initialDelaySeconds: 10 - periodSeconds: 10 - timeoutSeconds: 15 - failureThreshold: 8 + initialDelaySeconds: 0 + periodSeconds: 1 + timeoutSeconds: 5 + failureThreshold: 20 successThreshold: 1 # Waiting GraphScope instance ready until reached timeout. timeout_seconds: 1200 @@ -56,18 +64,35 @@ engines: num_workers: 2 # Available options of log_level: INFO, DEBUG log_level: INFO - image: - name: registry.cn-hongkong.aliyuncs.com/graphscope/graphscope - # Overrides the image tag whose default is the chart appVersion. - tag: "" + + analytical: + enabled: True + image: + name: analytical + analytical_java: + enabled: False + image: + name: analytical-java + interactive: + enabled: True + image: + name: interactive + learning: + enabled: True + image: + name: learning + dataset: + enabled: False + image: + name: dataset + resources: requests: - cpu: 2.0 - memory: 4Gi + cpu: 1 + memory: 1Gi limits: cpu: 2.0 - memory: 4Gi - + memory: 2Gi vineyard: # When `vineyard.daemonset` is set to the Helm release name, the coordinator will @@ -77,9 +102,9 @@ vineyard: # The vineyard IPC socket is placed on host at /var/run/vineyard-{namespace}-{release}. daemonset: "" image: - name: registry.cn-hongkong.aliyuncs.com/graphscope/graphscope + name: ghcr.io/v6d-io/v6d/vineyardd # Overrides the image tag whose default is the chart appVersion. - tag: "" + tag: v0.11.2 resources: requests: cpu: 0.5 @@ -91,19 +116,6 @@ vineyard: shared_mem: 8Gi -etcd: - image: - name: quay.io/coreos/etcd - # Overrides the image tag whose default is the chart appVersion. 
- tag: v3.4.13 - resources: - requests: - cpu: 0.5 - memory: 128Mi - limits: - cpu: 0.5 - memory: 128Mi - withJupyter: true jupyter: @@ -112,9 +124,9 @@ jupyter: tag: "" resources: requests: - cpu: 1.0 - memory: 4Gi + cpu: 0.5 + memory: 256Mi limits: cpu: 1.0 - memory: 4Gi + memory: 512Mi port: 30080 diff --git a/coordinator/gscoordinator/cluster_builder.py b/coordinator/gscoordinator/cluster_builder.py new file mode 100644 index 000000000000..16208b86e194 --- /dev/null +++ b/coordinator/gscoordinator/cluster_builder.py @@ -0,0 +1,629 @@ +import base64 +import json +import logging +import os + +try: + from kubernetes import client as kube_client + from kubernetes import config as kube_config + from kubernetes import watch as kube_watch + from kubernetes.client import AppsV1Api + from kubernetes.client import CoreV1Api + from kubernetes.client.rest import ApiException as K8SApiException + from kubernetes.config import ConfigException as K8SConfigException +except ImportError: + kube_client = None + kube_config = None + kube_watch = None + AppsV1Api = None + CoreV1Api = None + K8SApiException = None + K8SConfigException = None + +from pprint import pprint + +from graphscope.deploy.kubernetes.resource_builder import ResourceBuilder +from graphscope.deploy.kubernetes.utils import get_service_endpoints + +from gscoordinator.version import __version__ + +logger = logging.getLogger("graphscope") + + +BASE_MACHINE_ENVS = { + "MY_NODE_NAME": "spec.nodeName", + "MY_POD_NAME": "metadata.name", + "MY_POD_NAMESPACE": "metadata.namespace", + "MY_POD_IP": "status.podIP", + "MY_HOST_NAME": "status.podIP", +} + + +_annotations = { + "service.beta.kubernetes.io/alibaba-cloud-loadbalancer-health-check-type": "tcp", + "service.beta.kubernetes.io/alibaba-cloud-loadbalancer-health-check-connect-timeout": "8", + "service.beta.kubernetes.io/alibaba-cloud-loadbalancer-healthy-threshold": "2", + "service.beta.kubernetes.io/alibaba-cloud-loadbalancer-unhealthy-threshold": "2", + 
"service.beta.kubernetes.io/alibaba-cloud-loadbalancer-health-check-interval": "1", +} + + +class EngineCluster: + def __init__( + self, + engine_cpu, + engine_mem, + engine_pod_node_selector, + glog_level, + image_pull_policy, + image_pull_secrets, + image_registry, + image_repository, + image_tag, + instance_id, + namespace, + num_workers, + preemptive, + service_type, + vineyard_cpu, + vineyard_daemonset, + vineyard_image, + vineyard_mem, + vineyard_shared_mem, + volumes, + with_analytical, + with_analytical_java, + with_dataset, + with_interactive, + with_learning, + with_mars, + ): + self._gs_prefix = "gs-engine-" + self._analytical_prefix = "gs-analytical-" + self._interactive_frontend_prefix = "gs-interactive-frontend-" + + self._learning_prefix = "gs-learning-" + self._learning_service_name_prefix = "gs-graphlearn-service-" + + self._vineyard_prefix = "vineyard-" + self._vineyard_service_name_prefix = "gs-vineyard-service-" + + self._mars_scheduler_name_prefix = "mars-scheduler-" + self._mars_service_name_prefix = "mars-" + + self._instance_id = instance_id + + self._namespace = namespace + self._engine_labels = { + "app.kubernetes.io/name": "graphscope", + "app.kubernetes.io/instance": self._instance_id, + "app.kubernetes.io/version": __version__, + "app.kubernetes.io/component": "engine", + } + self._frontend_labels = self._engine_labels.copy() + self._frontend_labels["app.kubernetes.io/component"] = "frontend" + + self._with_dataset = with_dataset + if not image_registry: + image_prefix = image_repository + else: + image_prefix = f"{image_registry}/{image_repository}" + self._analytical_image = f"{image_prefix}/analytical:{image_tag}" + self._analytical_java_image = f"{image_prefix}/analytical-java:{image_tag}" + self._interactive_frontend_image = ( + f"{image_prefix}/interactive-frontend:{image_tag}" + ) + self._interactive_executor_image = ( + f"{image_prefix}/interactive-executor:{image_tag}" + ) + self._learning_image = 
f"{image_prefix}/learning:{image_tag}" + self._dataset_image = f"{image_prefix}/dataset:{image_tag}" + + self._vineyard_image = vineyard_image + + self._image_pull_policy = image_pull_policy + self._image_pull_secrets = image_pull_secrets + + self._vineyard_daemonset = vineyard_daemonset + + if with_analytical and with_analytical_java: + logger.warning( + "Cannot setup `with_analytical` and `with_analytical_java` at the same time" + ) + logger.warning("Disabled `analytical`.") + self._with_analytical = False + + self._with_analytical = with_analytical + self._with_analytical_java = with_analytical_java + self._with_interactive = with_interactive + self._with_learning = with_learning + self._with_mars = with_mars + + self._glog_level = glog_level + self._preemptive = preemptive + self._vineyard_shared_mem = vineyard_shared_mem + + self._node_selector = ( + json.loads(self.base64_decode(engine_pod_node_selector)) + if engine_pod_node_selector + else None + ) + self._num_workers = num_workers + self._volumes = json.loads(self.base64_decode(volumes)) if volumes else None + + self._sock = "/tmp/vineyard_workspace/vineyard.sock" + + self._vineyard_requests = {"cpu": vineyard_cpu, "memory": vineyard_mem} + self._analytical_requests = {"cpu": engine_cpu, "memory": engine_mem} + self._executor_requests = {"cpu": "2000m", "memory": engine_mem} + self._learning_requests = {"cpu": "1000m", "memory": "256Mi"} + self._frontend_requests = {"cpu": "200m", "memory": "512Mi"} + self._dataset_requests = {"cpu": "200m", "memory": "64Mi"} + + self._service_type = service_type + self._vineyard_service_port = 9600 # fixed + self._etcd_port = 2379 + + # This must be same with v6d:modules/io/python/drivers/io/kube_ssh.sh + self.analytical_container_name = "engine" + self.interactive_frontend_container_name = "frontend" + self.interactive_executor_container_name = "executor" + self.learning_container_name = "learning" + self.dataset_container_name = "dataset" + self.mars_container_name = 
"mars" + self.vineyard_container_name = "vineyard" + + @property + def vineyard_ipc_socket(self): + return self._sock + + def base64_decode(self, string): + return base64.b64decode(string).decode("utf-8") + + def get_common_env(self): + def put_if_exists(env: dict, key: str): + if key in os.environ: + env[key] = os.environ[key] + + env = { + "GLOG_v": str(self._glog_level), + "VINEYARD_IPC_SOCKET": self.vineyard_ipc_socket, + "WITH_VINEYARD": "ON", + } + put_if_exists(env, "OPAL_PREFIX") + put_if_exists(env, "OPAL_BINDIR") + env = [kube_client.V1EnvVar(name=k, value=v) for k, v in env.items()] + return env + + def get_base_machine_env(self): + env = [ + ResourceBuilder.get_value_from_field_ref(key, value) + for key, value in BASE_MACHINE_ENVS.items() + ] + return env + + def get_vineyard_socket_volume(self): + name = "vineyard-ipc-socket" + volume = kube_client.V1Volume(name=name) + if self._vineyard_daemonset is None: + empty_dir = kube_client.V1EmptyDirVolumeSource() + volume.empty_dir = empty_dir + else: + path = f"/var/run/vineyard-{self._namespace}-{self._vineyard_daemonset}" + host_path = kube_client.V1HostPathVolumeSource(path=path) + host_path.type = "Directory" + volume.host_path = host_path + + source_volume_mount = kube_client.V1VolumeMount( + name=name, mount_path="/tmp/vineyard_workspace" + ) + destination_volume_mount = source_volume_mount + + return volume, source_volume_mount, destination_volume_mount + + def get_shm_volume(self): + name = "host-shm" + volume = kube_client.V1Volume(name=name) + volume.empty_dir = kube_client.V1EmptyDirVolumeSource() + volume.empty_dir.medium = "Memory" + + source_volume_mount = kube_client.V1VolumeMount( + name=name, mount_path="/dev/shm" + ) + destination_volume_mount = source_volume_mount + + return volume, source_volume_mount, destination_volume_mount + + def get_dataset_volume(self): + name = "dataset" + volume = kube_client.V1Volume(name=name) + volume.empty_dir = kube_client.V1EmptyDirVolumeSource() + + 
source_volume_mount = kube_client.V1VolumeMount( + name=name, mount_path="/dataset" + ) + source_volume_mount.mount_propagation = "Bidirectional" + + # volume mount in engine container + destination_volume_mount = kube_client.V1VolumeMount( + name=name, mount_path="/dataset" + ) + destination_volume_mount.read_only = True + destination_volume_mount.mount_propagation = "HostToContainer" + + return volume, source_volume_mount, destination_volume_mount + + def get_engine_container_helper( + self, name, image, args, volume_mounts, requests, limits + ): + container = kube_client.V1Container( + name=name, image=image, args=args, volume_mounts=volume_mounts + ) + container.image_pull_policy = self._image_pull_policy + # container.env = self.get_common_env() + self.get_base_machine_env() + container.env = self.get_common_env() + container.resources = ResourceBuilder.get_resources( + requests, None, self._preemptive + ) + return container + + def get_analytical_container(self, volume_mounts, with_java=False): + name = self.analytical_container_name + image = self._analytical_image if not with_java else self._analytical_java_image + args = ["tail", "-f", "/dev/null"] + container = self.get_engine_container_helper( + name, + image, + args, + volume_mounts, + self._analytical_requests, + self._analytical_requests, + ) + + readiness_probe = kube_client.V1Probe() + command = ["/bin/bash", "-c", f"ls {self._sock} 2>/dev/null"] + readiness_probe._exec = kube_client.V1ExecAction(command=command) + readiness_probe.initial_delay_seconds = 5 + readiness_probe.period_seconds = 2 + readiness_probe.failure_threshold = 3 + container.readiness_probe = readiness_probe + + # container.lifecycle = self.get_lifecycle() + return container + + def get_interactive_executor_container(self, volume_mounts): + name = self.interactive_executor_container_name + image = self._interactive_executor_image + args = ["tail", "-f", "/dev/null"] + container = self.get_engine_container_helper( + name, + image, 
+ args, + volume_mounts, + self._executor_requests, + self._executor_requests, + ) + return container + + def get_learning_container(self, volume_mounts): + name = self.learning_container_name + image = self._learning_image + args = ["tail", "-f", "/dev/null"] + container = self.get_engine_container_helper( + name, + image, + args, + volume_mounts, + self._learning_requests, + self._learning_requests, + ) + return container + + def get_vineyard_container(self, volume_mounts): + name = self.vineyard_container_name + image = self._vineyard_image + sts_name = self.engine_stateful_set_name + svc_name = sts_name + "-headless" + pod0_dns = f"{sts_name}-0.{svc_name}.{self._namespace}.svc.cluster.local" + vineyard_cmd = ( + f"vineyardd -size {self._vineyard_shared_mem} -socket {self._sock}" + ) + args = f""" + [[ `hostname` =~ -([0-9]+)$ ]] || exit 1; + ordinal=${{BASH_REMATCH[1]}}; + if (( $ordinal == 0 )); then + {vineyard_cmd} -etcd_endpoint http://0.0.0.0:{self._etcd_port} + else + until nslookup {pod0_dns}; do sleep 1; done; + {vineyard_cmd} -etcd_endpoint http://{pod0_dns}:{self._etcd_port} + fi; + """ + args = ["bash", "-c", args] + container = self.get_engine_container_helper( + name, + image, + args, + volume_mounts, + self._vineyard_requests, + self._vineyard_requests, + ) + container.ports = [ + kube_client.V1ContainerPort(container_port=self._vineyard_service_port), + kube_client.V1ContainerPort(container_port=self._etcd_port), + ] + return container + + def get_mars_container(self): + _ = self.mars_container_name + return + + def get_dataset_container(self, volume_mounts): + name = self.dataset_container_name + container = kube_client.V1Container(name=name) + container.image = self._dataset_image + container.image_pull_policy = self._image_pull_policy + + container.resources = ResourceBuilder.get_resources( + self._dataset_requests, self._dataset_requests + ) + + container.volume_mounts = volume_mounts + + container.security_context = 
kube_client.V1SecurityContext(privileged=True) + return container + + def get_engine_pod_spec(self): + containers = [] + volumes = [] + + socket_volume = self.get_vineyard_socket_volume() + shm_volume = self.get_shm_volume() + + volumes.extend([socket_volume[0], shm_volume[0]]) + if self._vineyard_daemonset is None: + containers.append( + self.get_vineyard_container( + volume_mounts=[socket_volume[1], shm_volume[1]] + ) + ) + + engine_volume_mounts = [socket_volume[2], shm_volume[2]] + + if self._volumes and self._volumes is not None: + udf_volumes = ResourceBuilder.get_user_defined_volumes(self._volumes) + volumes.extend(udf_volumes[0]) + engine_volume_mounts.extend(udf_volumes[2]) + + if self._with_dataset: + dataset_volume = self.get_dataset_volume() + volumes.append(dataset_volume[0]) + containers.append( + self.get_dataset_container(volume_mounts=[dataset_volume[1]]) + ) + engine_volume_mounts.append(dataset_volume[2]) + if self._with_analytical: + containers.append( + self.get_analytical_container(volume_mounts=engine_volume_mounts) + ) + if self._with_analytical_java: + containers.append( + self.get_analytical_container( + volume_mounts=engine_volume_mounts, with_java=True + ) + ) + if self._with_interactive: + containers.append( + self.get_interactive_executor_container( + volume_mounts=engine_volume_mounts + ) + ) + if self._with_learning: + containers.append( + self.get_learning_container(volume_mounts=engine_volume_mounts) + ) + if self._with_mars: + containers.append(self.get_mars_container()) + return ResourceBuilder.get_pod_spec( + containers=containers, + image_pull_secrets=self._image_pull_secrets, + node_selector=self._node_selector, + volumes=volumes, + ) + + def get_engine_pod_template_spec(self): + spec = self.get_engine_pod_spec() + return ResourceBuilder.get_pod_template_spec(spec, self._engine_labels) + + def get_engine_stateful_set(self): + name = self.engine_stateful_set_name + template = self.get_engine_pod_template_spec() + replicas = 
self._num_workers + service_name = name + "-headless" + spec = ResourceBuilder.get_stateful_set_spec( + template, replicas, self._engine_labels, service_name + ) + return ResourceBuilder.get_stateful_set( + self._namespace, name, spec, self._engine_labels + ) + + def get_engine_headless_service(self): + name = self.engine_stateful_set_name + "-headless" + ports = [kube_client.V1ServicePort(name="etcd", port=self._etcd_port)] + service_spec = ResourceBuilder.get_service_spec( + "ClusterIP", ports, self._engine_labels, None + ) + # Necessary, create a headless service for statefulset + service_spec.cluster_ip = "None" + service = ResourceBuilder.get_service( + self._namespace, name, service_spec, self._engine_labels + ) + return service + + def get_vineyard_service(self): + service_type = self._service_type + name = f"{self._vineyard_prefix}{self._instance_id}" + ports = [kube_client.V1ServicePort(name=name, port=self._vineyard_service_port)] + service_spec = ResourceBuilder.get_service_spec( + service_type, ports, self._engine_labels, None + ) + service = ResourceBuilder.get_service( + self._namespace, name, service_spec, self._engine_labels + ) + return service + + def get_learning_service(self, object_id, start_port): + service_type = self._service_type + num_workers = self._num_workers + name = f"{self._learning_prefix}{object_id}" + ports = [] + for i in range(start_port, start_port + num_workers): + port = kube_client.V1ServicePort(name=f"{name}-{i}", port=i, protocol="TCP") + ports.append(port) + service_spec = ResourceBuilder.get_service_spec( + service_type, ports, self._engine_labels, "Local" + ) + service = ResourceBuilder.get_service( + self._namespace, name, service_spec, self._engine_labels + ) + return service + + def get_learning_ports(self, start_port): + num_workers = self._num_workers + return [i for i in range(start_port, start_port + num_workers)] + + @property + def engine_stateful_set_name(self): + return f"{self._gs_prefix}{self._instance_id}" 
+ + @property + def frontend_deployment_name(self): + return f"{self._interactive_frontend_prefix}{self._instance_id}" + + @property + def vineyard_service_name(self): + return f"{self._vineyard_prefix}{self._instance_id}" + + def get_vineyard_service_endpoint(self, api_client): + # return f"{self.vineyard_service_name}:{self._vineyard_service_port}" + service_type = self._service_type + service_name = self.vineyard_service_name + endpoints = get_service_endpoints( + api_client=api_client, + namespace=self._namespace, + name=service_name, + service_type=service_type, + ) + assert len(endpoints) > 0 + return endpoints[0] + + def get_learning_service_name(self, object_id): + return f"{self._learning_service_name_prefix}{object_id}" + + def get_graphlearn_service_endpoint(self, api_client, object_id, pod_host_ip_list): + service_name = self.get_learning_service_name(object_id) + service_type = self._service_type + core_api = kube_client.CoreV1Api(api_client) + if service_type == "NodePort": + # TODO: add label_selector to filter the service + services = core_api.list_namespaced_service(self._namespace) + for svc in services.items: + if svc.metadata.name == service_name: + endpoints = [] + for ip, port_spec in zip(pod_host_ip_list, svc.spec.ports): + endpoints.append( + ( + f"{ip}:{port_spec.node_port}", + int(port_spec.name.split("-")[-1]), + ) + ) + endpoints.sort(key=lambda ep: ep[1]) + return [ep[0] for ep in endpoints] + elif service_type == "LoadBalancer": + endpoints = get_service_endpoints( + api_client=api_client, + namespace=self._namespace, + name=service_name, + service_type=service_type, + ) + return endpoints + raise RuntimeError("Get graphlearn service endpoint failed.") + + def get_interactive_frontend_container(self): + name = self.interactive_frontend_container_name + image = self._interactive_frontend_image + args = ["tail", "-f", "/dev/null"] + container = kube_client.V1Container(name=name, image=image, args=args) + container.image_pull_policy = 
self._image_pull_policy + container.resources = ResourceBuilder.get_resources( + self._frontend_requests, None + ) + return container + + def get_interactive_frontend_deployment(self, replicas=1): + name = self.frontend_deployment_name + container = self.get_interactive_frontend_container() + pod_spec = ResourceBuilder.get_pod_spec(containers=[container]) + template_spec = ResourceBuilder.get_pod_template_spec( + pod_spec, self._frontend_labels + ) + deployment_spec = ResourceBuilder.get_deployment_spec( + template_spec, replicas, self._frontend_labels + ) + return ResourceBuilder.get_deployment( + self._namespace, name, deployment_spec, self._frontend_labels + ) + + def get_interactive_frontend_service(self, port): + name = self.frontend_deployment_name + service_type = self._service_type + ports = [kube_client.V1ServicePort(name="gremlin", port=port)] + service_spec = ResourceBuilder.get_service_spec( + service_type, ports, self._frontend_labels, None + ) + service = ResourceBuilder.get_service( + self._namespace, name, service_spec, self._frontend_labels, _annotations + ) + return service + + +class MarsCluster: + def __init__(self, instance_id, namespace, service_type): + self._mars_prefix = "mars-" + self._mars_scheduler_port = 7103 # fixed + self._mars_scheduler_web_port = 7104 # fixed + self._mars_worker_port = 7105 # fixed + self._instance_id = instance_id + self._namespace = namespace + self._service_type = service_type + + self._mars_worker_requests = {"cpu": "200m", "memory": "512Mi"} + self._mars_scheduler_requests = {"cpu": "200m", "memory": "512Mi"} + + def get_mars_deployment(self): + pass + + def get_mars_service(self): + pass + + @property + def mars_scheduler_service_name(self): + return f"{self._mars_prefix}{self._instance_id}" + + @property + def mars_scheduler_web_port(self): + return self._mars_scheduler_web_port + + def get_mars_service_endpoint(self, api_client): + # Always len(endpoints) >= 1 + service_name = 
self.mars_scheduler_service_name + service_type = self._service_type + web_port = self.mars_scheduler_web_port + endpoints = get_service_endpoints( + api_client=api_client, + namespace=self._namespace, + name=service_name, + service_type=service_type, + query_port=web_port, + ) + assert len(endpoints) > 0 + return f"http://{endpoints[0]}" diff --git a/coordinator/gscoordinator/coordinator.py b/coordinator/gscoordinator/coordinator.py index 3ac80a0187a5..a672a345a814 100644 --- a/coordinator/gscoordinator/coordinator.py +++ b/coordinator/gscoordinator/coordinator.py @@ -184,7 +184,7 @@ def __del__(self): @Monitor.connectSession def ConnectSession(self, request, context): - if self._launcher.analytical_engine_endpoint is not None: + if self._launcher.analytical_engine_process is not None: engine_config = self._operation_executor.get_analytical_engine_config() engine_config.update(self._launcher.get_engine_config()) host_names = self._launcher.hosts.split(",") @@ -222,14 +222,6 @@ def ConnectSession(self, request, context): # Connect to serving coordinator. self._connected = True - if self._session_id is None: # else reuse previous session. - self._session_id = self._generate_session_id() - self._launcher.set_session_workspace(self._session_id) - - self._operation_executor = OperationExecutor( - self._session_id, self._launcher, self._object_manager - ) - # Cleanup after timeout seconds self._dangling_timeout_seconds = request.dangling_timeout_seconds # If true, also delete graphscope instance (such as pods) in closing process @@ -238,6 +230,20 @@ def ConnectSession(self, request, context): # Session connected, fetch logs via gRPC. self._streaming_logs = True sys.stdout.drop(False) + + if self._session_id is None: # else reuse previous session. 
+ self._session_id = self._generate_session_id() + self._launcher.set_session_workspace(self._session_id) + + self._operation_executor = OperationExecutor( + self._session_id, self._launcher, self._object_manager + ) + if not self._launcher.start(): + # connect failed, more than one connection at the same time. + context.set_code(grpc.StatusCode.ABORTED) + context.set_details("Create GraphScope cluster failed") + return message_pb2.ConnectSessionResponse() + return message_pb2.ConnectSessionResponse( session_id=self._session_id, cluster_type=self._launcher.type(), @@ -382,12 +388,16 @@ def AddLib(self, request, context): return message_pb2.AddLibResponse() def CreateAnalyticalInstance(self, request, context): + engine_config = {} try: - self._launcher.start() # create GAE rpc service self._launcher.create_analytical_instance() engine_config = self._operation_executor.get_analytical_engine_config() engine_config.update(self._launcher.get_engine_config()) + except NotImplementedError: + # TODO: This is a workaround for that we launching gae unconditionally after session connects, + # make it an error when above logic has been changed. + logger.warning("Analytical engine is not enabled.") except grpc.RpcError as e: context.set_code(e.code()) context.set_details("Get engine config failed: " + e.details()) @@ -500,7 +510,11 @@ def CloseLearningInstance(self, request, context): @Monitor.cleanup def cleanup(self, cleanup_instance=True, is_dangling=False): # clean up session resources. 
- logger.info("Cleaning up resources in coordinator") + logger.info( + "Clean up resources, cleanup_instance: %s, is_dangling: %s", + cleanup_instance, + is_dangling, + ) for _, obj in self._object_manager.items(): op_type, config = None, {} if obj.type == "app": @@ -577,7 +591,6 @@ def parse_sys_args(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter ) - parser.add_argument( "--num_workers", type=int, @@ -634,16 +647,22 @@ def parse_sys_args(): help="The namespace to create all resource, which must exist in advance.", ) parser.add_argument( - "--k8s_service_type", + "--k8s_image_registry", type=str, default="", help="k8s image registry" + ) + parser.add_argument( + "--k8s_image_repository", type=str, - default="NodePort", - help="Service type, choose from 'NodePort' or 'LoadBalancer'.", + default="graphscope", + help="k8s image repository", ) parser.add_argument( - "--k8s_gs_image", + "--k8s_image_tag", type=str, default=__version__, help="k8s image tag" + ) + parser.add_argument( + "--k8s_service_type", type=str, - default=f"registry.cn-hongkong.aliyuncs.com/graphscope/graphscope:{__version__}", - help="Docker image of graphscope engines.", + default="NodePort", + help="Service type, choose from 'NodePort' or 'LoadBalancer'.", ) parser.add_argument( "--k8s_coordinator_name", @@ -657,12 +676,6 @@ def parse_sys_args(): default="", help="Coordinator service name of graphscope instance.", ) - parser.add_argument( - "--k8s_etcd_image", - type=str, - default="registry.cn-hongkong.aliyuncs.com/graphscope/etcd:v3.4.13", - help="Docker image of etcd, needed by vineyard.", - ) parser.add_argument( "--k8s_image_pull_policy", type=str, @@ -672,13 +685,13 @@ def parse_sys_args(): parser.add_argument( "--k8s_image_pull_secrets", type=str, - default="graphscope", + default="", help="A list of comma separated secrets to pull image.", ) parser.add_argument( "--k8s_vineyard_daemonset", type=str, - default="", + default=None, help="Use the 
existing vineyard DaemonSet with name 'k8s_vineyard_daemonset'.", ) parser.add_argument( @@ -687,6 +700,12 @@ def parse_sys_args(): default=1.0, help="CPU cores of vineyard container.", ) + parser.add_argument( + "--k8s_vineyard_image", + type=str, + default=None, + help="Image for vineyard container", + ) parser.add_argument( "--k8s_vineyard_mem", type=str, @@ -730,22 +749,36 @@ def parse_sys_args(): help="The port that etcd server will beind to for accepting peer connections. Defaults to 2380.", ) parser.add_argument( - "--k8s_etcd_num_pods", - type=int, - default=3, - help="The number of etcd pods.", + "--k8s_with_analytical", + type=str2bool, + nargs="?", + const=True, + default=True, + help="Enable analytical engine or not.", ) parser.add_argument( - "--k8s_etcd_cpu", - type=float, - default=1.0, - help="CPU cores of etcd pod, default: 1.0", + "--k8s_with_analytical_java", + type=str2bool, + nargs="?", + const=True, + default=True, + help="Enable analytical engine with java or not.", ) parser.add_argument( - "--k8s_etcd_mem", - type=str, - default="256Mi", - help="Memory of etcd pod, suffix with ['Mi', 'Gi', 'Ti'].", + "--k8s_with_interactive", + type=str2bool, + nargs="?", + const=True, + default=True, + help="Enable interactive engine or not.", + ) + parser.add_argument( + "--k8s_with_learning", + type=str2bool, + nargs="?", + const=True, + default=True, + help="Enable learning engine or not.", ) parser.add_argument( "--k8s_with_mars", @@ -779,12 +812,6 @@ def parse_sys_args(): default="2Gi", help="Memory of Mars scheduler container, default: 2Gi", ) - parser.add_argument( - "--k8s_etcd_pod_node_selector", - type=str, - default="", - help="Node selector for etcd pods, default is None", - ) parser.add_argument( "--k8s_engine_pod_node_selector", type=str, @@ -794,9 +821,17 @@ def parse_sys_args(): parser.add_argument( "--k8s_volumes", type=str, - default="{}", + default="", help="A json string specifies the kubernetes volumes to mount.", ) + 
parser.add_argument( + "--k8s_delete_namespace", + type=str2bool, + nargs="?", + const=True, + default=False, + help="Delete the namespace that created by graphscope.", + ) parser.add_argument( "--timeout_seconds", type=int, @@ -818,25 +853,13 @@ def parse_sys_args(): help="Wait until the graphscope instance has been deleted successfully", ) parser.add_argument( - "--k8s_delete_namespace", + "--k8s_with_dataset", type=str2bool, nargs="?", - const=True, + const=False, default=False, - help="Delete the namespace that created by graphscope.", - ) - parser.add_argument( - "--mount_dataset", - type=str, - default=None, help="Mount the aliyun dataset bucket as a volume by ossfs.", ) - parser.add_argument( - "--k8s_dataset_image", - type=str, - default="registry.cn-hongkong.aliyuncs.com/graphscope/dataset:{__version__}", - help="Docker image to mount the dataset bucket", - ) parser.add_argument( "--monitor", type=str2bool, @@ -856,7 +879,6 @@ def parse_sys_args(): def launch_graphscope(): args = parse_sys_args() - logger.info("Launching with args %s", args) launcher = get_launcher(args) start_server(launcher, args) @@ -864,43 +886,41 @@ def launch_graphscope(): def get_launcher(args): if args.cluster_type == "k8s": launcher = KubernetesClusterLauncher( - namespace=args.k8s_namespace, - service_type=args.k8s_service_type, - gs_image=args.k8s_gs_image, - etcd_image=args.k8s_etcd_image, - dataset_image=args.k8s_dataset_image, coordinator_name=args.k8s_coordinator_name, coordinator_service_name=args.k8s_coordinator_service_name, - etcd_addrs=args.etcd_addrs, - etcd_listening_client_port=args.etcd_listening_client_port, - etcd_listening_peer_port=args.etcd_listening_peer_port, - etcd_num_pods=args.k8s_etcd_num_pods, - etcd_cpu=args.k8s_etcd_cpu, - etcd_mem=args.k8s_etcd_mem, + delete_namespace=args.k8s_delete_namespace, engine_cpu=args.k8s_engine_cpu, engine_mem=args.k8s_engine_mem, - vineyard_daemonset=args.k8s_vineyard_daemonset, - vineyard_cpu=args.k8s_vineyard_cpu, - 
vineyard_mem=args.k8s_vineyard_mem, - vineyard_shared_mem=args.vineyard_shared_mem, + engine_pod_node_selector=args.k8s_engine_pod_node_selector, + image_pull_policy=args.k8s_image_pull_policy, + image_pull_secrets=args.k8s_image_pull_secrets, + image_registry=args.k8s_image_registry, + image_repository=args.k8s_image_repository, + image_tag=args.k8s_image_tag, + instance_id=args.instance_id, + log_level=args.log_level, mars_worker_cpu=args.k8s_mars_worker_cpu, mars_worker_mem=args.k8s_mars_worker_mem, mars_scheduler_cpu=args.k8s_mars_scheduler_cpu, mars_scheduler_mem=args.k8s_mars_scheduler_mem, - etcd_pod_node_selector=args.k8s_etcd_pod_node_selector, - engine_pod_node_selector=args.k8s_engine_pod_node_selector, - with_mars=args.k8s_with_mars, - image_pull_policy=args.k8s_image_pull_policy, - image_pull_secrets=args.k8s_image_pull_secrets, - volumes=args.k8s_volumes, - mount_dataset=args.mount_dataset, + with_dataset=args.k8s_with_dataset, + namespace=args.k8s_namespace, num_workers=args.num_workers, preemptive=args.preemptive, - instance_id=args.instance_id, - log_level=args.log_level, + service_type=args.k8s_service_type, timeout_seconds=args.timeout_seconds, + vineyard_cpu=args.k8s_vineyard_cpu, + vineyard_daemonset=args.k8s_vineyard_daemonset, + vineyard_image=args.k8s_vineyard_image, + vineyard_mem=args.k8s_vineyard_mem, + vineyard_shared_mem=args.vineyard_shared_mem, + volumes=args.k8s_volumes, waiting_for_delete=args.waiting_for_delete, - delete_namespace=args.k8s_delete_namespace, + with_mars=args.k8s_with_mars, + with_analytical=args.k8s_with_analytical, + with_analytical_java=args.k8s_with_analytical_java, + with_interactive=args.k8s_with_interactive, + with_learning=args.k8s_with_learning, ) elif args.cluster_type == "hosts": launcher = LocalLauncher( @@ -940,6 +960,9 @@ def start_server(launcher, args): coordinator_service_servicer, server ) server.add_insecure_port(f"0.0.0.0:{args.port}") + + logger.info("Start server with args %s", args) + 
logger.info("Coordinator server listen at 0.0.0.0:%d", args.port) server.start() diff --git a/coordinator/gscoordinator/kubernetes_launcher.py b/coordinator/gscoordinator/kubernetes_launcher.py index 4a7656ebdf08..2fbb2f6a6469 100644 --- a/coordinator/gscoordinator/kubernetes_launcher.py +++ b/coordinator/gscoordinator/kubernetes_launcher.py @@ -22,12 +22,13 @@ import logging import os import random -import shutil -import socket +import shlex import subprocess import sys import time -import traceback + +from gscoordinator.cluster_builder import EngineCluster +from gscoordinator.cluster_builder import MarsCluster try: from kubernetes import client as kube_client @@ -46,14 +47,8 @@ K8SApiException = None K8SConfigException = None -from graphscope.deploy.kubernetes.resource_builder import GSEngineBuilder -from graphscope.deploy.kubernetes.resource_builder import GSEtcdBuilder -from graphscope.deploy.kubernetes.resource_builder import ServiceBuilder -from graphscope.deploy.kubernetes.resource_builder import VolumeBuilder -from graphscope.deploy.kubernetes.resource_builder import resolve_volume_builder from graphscope.deploy.kubernetes.utils import delete_kubernetes_object from graphscope.deploy.kubernetes.utils import get_kubernetes_object_info -from graphscope.deploy.kubernetes.utils import get_service_endpoints from graphscope.deploy.kubernetes.utils import resolve_api_client from graphscope.framework.utils import PipeWatcher from graphscope.framework.utils import get_tempdir @@ -65,232 +60,174 @@ from gscoordinator.utils import INTERACTIVE_ENGINE_SCRIPT from gscoordinator.utils import WORKSPACE from gscoordinator.utils import ResolveMPICmdPrefix +from gscoordinator.utils import delegate_command_to_pod from gscoordinator.utils import parse_as_glog_level +from gscoordinator.utils import run_command from gscoordinator.version import __version__ logger = logging.getLogger("graphscope") -class ResourceManager(object): - """A class to manager kubernetes object. 
- - Object managed by this class will dump meta info to disk file - for pod preStop lifecycle management. - - meta info format: - - { - "my-deployment": "Deployment", - "my-service": "Service" - } - """ - - _resource_object_path = os.path.join(get_tempdir(), "resource_object") # fixed - - def __init__(self, api_client): - self._api_client = api_client - self._resource_object = [] - self._meta_info = {} - - def append(self, target): - self._resource_object.append(target) - self._meta_info.update( - get_kubernetes_object_info(api_client=self._api_client, target=target) - ) - self.dump() - - def extend(self, targets): - self._resource_object.extend(targets) - for target in targets: - self._meta_info.update( - get_kubernetes_object_info(api_client=self._api_client, target=target) - ) - self.dump() - - def clear(self): - self._resource_object.clear() - self._meta_info.clear() - - def __str__(self): - return str(self._meta_info) - - def __getitem__(self, index): - return self._resource_object[index] - - def dump(self): - with open(self._resource_object_path, "w") as f: - json.dump(self._meta_info, f) - - def dump_with_extra_resource(self, resource): - """Also dump with extra resources. A typical scenario is - dump meta info of namespace for coordinator dangling processing. 
- """ - rlt = copy.deepcopy(self._meta_info) - rlt.update(resource) - with open(self._resource_object_path, "w") as f: - json.dump(rlt, f) - - class KubernetesClusterLauncher(AbstractLauncher): - _gs_etcd_builder_cls = GSEtcdBuilder - _gs_engine_builder_cls = GSEngineBuilder - _gs_mars_scheduler_builder_cls = GSEngineBuilder - - _etcd_name_prefix = "gs-etcd-" - _etcd_service_name_prefix = "gs-etcd-service-" - _engine_name_prefix = "gs-engine-" - _vineyard_service_name_prefix = "gs-vineyard-service-" - _gle_service_name_prefix = "gs-graphlearn-service-" - - _vineyard_container_name = "vineyard" # fixed - _etcd_container_name = "etcd" - _engine_container_name = "engine" # fixed - - _mars_scheduler_container_name = "mars" # fixed - _mars_worker_container_name = "mars" # fixed - _mars_scheduler_name_prefix = "marsscheduler-" - _mars_service_name_prefix = "mars-" - - _random_analytical_engine_rpc_port = random.randint(56001, 57000) - - _vineyard_service_port = 9600 # fixed - _mars_scheduler_port = 7103 # fixed - _mars_scheduler_web_port = 7104 # fixed - _mars_worker_port = 7105 # fixed - def __init__( self, - namespace=None, - service_type=None, - gs_image=None, - etcd_image=None, - dataset_image=None, coordinator_name=None, coordinator_service_name=None, - etcd_addrs=None, - etcd_listening_client_port=None, - etcd_listening_peer_port=None, - etcd_num_pods=None, - etcd_cpu=None, - etcd_mem=None, + delete_namespace=None, engine_cpu=None, engine_mem=None, - vineyard_daemonset=None, - vineyard_cpu=None, - vineyard_mem=None, - vineyard_shared_mem=None, - mars_worker_cpu=None, - mars_worker_mem=None, - mars_scheduler_cpu=None, - mars_scheduler_mem=None, - etcd_pod_node_selector=None, engine_pod_node_selector=None, - with_mars=False, image_pull_policy=None, image_pull_secrets=None, - volumes=None, - mount_dataset=None, - num_workers=None, - preemptive=None, + image_registry=None, + image_repository=None, + image_tag=None, instance_id=None, log_level=None, + 
mars_scheduler_cpu=None, + mars_scheduler_mem=None, + mars_worker_cpu=None, + mars_worker_mem=None, + with_dataset=False, + namespace=None, + num_workers=None, + preemptive=None, + service_type=None, timeout_seconds=None, + vineyard_cpu=None, + vineyard_daemonset=None, + vineyard_image=None, + vineyard_mem=None, + vineyard_shared_mem=None, + volumes=None, waiting_for_delete=None, - delete_namespace=None, + with_mars=False, + with_analytical=True, + with_analytical_java=False, + with_interactive=True, + with_learning=True, **kwargs, ): super().__init__() self._api_client = resolve_api_client() self._core_api = kube_client.CoreV1Api(self._api_client) - self._app_api = kube_client.AppsV1Api(self._api_client) - - self._saved_locals = locals() - self._num_workers = self._saved_locals["num_workers"] - self._instance_id = self._saved_locals["instance_id"] - - # random for multiple k8s cluster in the same namespace - self._engine_name = self._engine_name_prefix + self._saved_locals["instance_id"] - self._etcd_addrs = etcd_addrs - self._etcd_listening_client_port = etcd_listening_client_port - self._etcd_listening_peer_port = etcd_listening_peer_port - self._etcd_name = self._etcd_name_prefix + self._saved_locals["instance_id"] - self._etcd_service_name = ( - self._etcd_service_name_prefix + self._saved_locals["instance_id"] - ) - self._mars_scheduler_name = ( - self._mars_scheduler_name_prefix + self._saved_locals["instance_id"] - ) + self._apps_api = kube_client.AppsV1Api(self._api_client) + self._resource_object = ResourceManager(self._api_client) + + self._instance_id = instance_id + self._namespace = namespace + self._delete_namespace = delete_namespace self._coordinator_name = coordinator_name self._coordinator_service_name = coordinator_service_name - self._resource_object: ResourceManager = ResourceManager(self._api_client) + self._image_registry = image_registry + self._image_repository = image_repository + self._image_tag = image_tag - # etcd pod info - 
self._etcd_num_pods = max(1, self._saved_locals["etcd_num_pods"]) - self._etcd_endpoint = None + image_pull_secrets = image_pull_secrets.split(",") if image_pull_secrets else [] - # image pull secrets - if image_pull_secrets is not None: - self._image_pull_secrets = image_pull_secrets.split(",") - else: - self._image_pull_secrets = [] + self._glog_level = parse_as_glog_level(log_level) - if volumes: - self._volumes = json.loads(volumes) - else: - self._volumes = dict() - if etcd_pod_node_selector: - self._etcd_pod_node_selector = json.loads(etcd_pod_node_selector) - else: - self._etcd_pod_node_selector = dict() - if engine_pod_node_selector: - self._engine_pod_node_selector = json.loads(engine_pod_node_selector) - else: - self._engine_pod_node_selector = dict() + self._num_workers = num_workers + + self._vineyard_daemonset = vineyard_daemonset + if vineyard_daemonset is not None: + try: + self._apps_api.read_namespaced_daemon_set( + vineyard_daemonset, self._namespace + ) + except K8SApiException: + logger.error(f"Vineyard daemonset {vineyard_daemonset} not found") + self._vineyard_daemonset = None + + self._engine_cpu = engine_cpu + self._engine_mem = engine_mem + self._vineyard_shared_mem = vineyard_shared_mem + + self._with_dataset = with_dataset + self._preemptive = preemptive + self._service_type = service_type + + assert timeout_seconds is not None + self._timeout_seconds = timeout_seconds + + self._waiting_for_delete = waiting_for_delete + + self._with_analytical = with_analytical + self._with_analytical_java = with_analytical_java + self._with_interactive = with_interactive + self._with_learning = with_learning + self._with_mars = with_mars + self._mars_scheduler_cpu = mars_scheduler_cpu + self._mars_scheduler_mem = mars_scheduler_mem + self._mars_worker_cpu = mars_worker_cpu + self._mars_worker_mem = mars_worker_mem - self._host0 = None self._pod_name_list = [] self._pod_ip_list = None self._pod_host_ip_list = None self._analytical_engine_endpoint = None - 
self._vineyard_service_endpoint = None self._mars_service_endpoint = None - self._closed = False - self._glog_level = parse_as_glog_level(log_level) + self._serving = False self._analytical_engine_process = None - + self._random_analytical_engine_rpc_port = random.randint(56001, 57000) # interactive engine # executor inter-processing port # executor rpc port # frontend port self._interactive_port = 8233 - # 8000 ~ 9000 is exposed - self._learning_engine_ports_usage = 8000 - self._graphlearn_services = dict() + self._learning_start_port = 8000 + + self._graphlearn_services = {} self._learning_instance_processes = {} # workspace - instance_id = self._saved_locals["instance_id"] self._instance_workspace = os.path.join(WORKSPACE, instance_id) os.makedirs(self._instance_workspace, exist_ok=True) self._session_workspace = None - # component service name - if self._exists_vineyard_daemonset(self._saved_locals["vineyard_daemonset"]): - self._vineyard_service_name = ( - f"{self._saved_locals['vineyard_daemonset']}-rpc" - ) - else: - self._vineyard_service_name = ( - f"{self._vineyard_service_name_prefix}{instance_id}" + self._engine_cluster = EngineCluster( + engine_cpu=engine_cpu, + engine_mem=engine_mem, + engine_pod_node_selector=engine_pod_node_selector, + glog_level=self._glog_level, + image_pull_policy=image_pull_policy, + image_pull_secrets=image_pull_secrets, + image_registry=image_registry, + image_repository=image_repository, + image_tag=image_tag, + instance_id=instance_id, + with_dataset=with_dataset, + namespace=namespace, + num_workers=num_workers, + preemptive=preemptive, + service_type=service_type, + vineyard_cpu=vineyard_cpu, + vineyard_daemonset=vineyard_daemonset, + vineyard_image=vineyard_image, + vineyard_mem=vineyard_mem, + vineyard_shared_mem=vineyard_shared_mem, + volumes=volumes, + with_mars=with_mars, + with_analytical=with_analytical, + with_analytical_java=with_analytical_java, + with_interactive=with_interactive, + with_learning=with_learning, 
+ ) + + self._vineyard_service_endpoint = None + self.vineyard_internal_service_endpoint = None + self._mars_service_endpoint = None + if self._with_mars: + self._mars_cluster = MarsCluster( + self._instance_id, self._namespace, self._service_type ) - self._mars_service_name = f"{self._mars_service_name_prefix}{instance_id}" def __del__(self): self.stop() @@ -298,90 +235,83 @@ def __del__(self): def type(self): return types_pb2.K8S - def get_vineyard_service_name(self): - return self._vineyard_service_name - - def get_vineyard_rpc_endpoint(self): - return self._vineyard_service_endpoint - - def get_mars_scheduler_endpoint(self): - return self._mars_service_endpoint - def waiting_for_delete(self): - return self._saved_locals["waiting_for_delete"] + return self._waiting_for_delete def get_namespace(self): - return self._saved_locals["namespace"] + return self._namespace def get_vineyard_stream_info(self): - hosts = [ - "%s:%s" % (self._saved_locals["namespace"], host) - for host in self._pod_name_list - ] + hosts = [f"{self._namespace}:{host}" for host in self._pod_name_list] return "kubernetes", hosts def set_session_workspace(self, session_id): self._session_workspace = os.path.join(self._instance_workspace, session_id) os.makedirs(self._session_workspace, exist_ok=True) + def launch_etcd(self): + pass + + def configure_etcd_endpoint(self): + pass + @property def preemptive(self): - return self._saved_locals["preemptive"] + return self._preemptive @property def hosts(self): """String of a list of pod name, comma separated.""" return ",".join(self._pod_name_list) + @property + def hosts_list(self): + return self._pod_name_list + def distribute_file(self, path): - d = os.path.dirname(path) for pod in self._pod_name_list: - subprocess.check_call( - [ - shutil.which("kubectl"), - "exec", - pod, - "-c", - "engine", - "--", - "mkdir", - "-p", - d, - ] - ) - subprocess.check_call( - [ - shutil.which("kubectl"), - "cp", - path, - "{}:{}".format(pod, path), - "-c", - 
"engine", - ] - ) + container = self._engine_cluster.analytical_container_name + try: + # The library may exists in the analytical pod. + test_cmd = f"test -f {path}" + logger.debug(delegate_command_to_pod(test_cmd, pod, container)) + logger.info("Library exists, skip distribute") + except RuntimeError: + cmd = f"mkdir -p {os.path.dirname(path)}" + logger.debug(delegate_command_to_pod(cmd, pod, container)) + cmd = f"kubectl cp {path} {pod}:{path} -c {container}" + logger.debug(run_command(cmd)) def close_analytical_instance(self): pass def launch_vineyard(self): """Launch vineyardd in k8s cluster.""" - # TODO: vineyard is launched by engine by now. + # vineyardd is auto launched in vineyardd container + # args = f"vineyardd -size {self._vineyard_shared_mem} \ + # -socket {self._engine_cluster._sock} -etcd_endpoint http://{self._pod_ip_list[0]}:2379" pass def close_etcd(self): - # TODO: Delete etcd pods and service. + # etcd is managed by vineyard pass def close_vineyard(self): + # No need to close vineyardd + # Use delete deployment instead pass def create_interactive_instance(self, object_id: int, schema_path: str): + if not self._with_interactive: + raise NotImplementedError("Interactive engine not enabled") """ Args: - config (dict): dict of op_def_pb2.OpDef.attr + object_id (int): object id of the graph. + schema_path (str): path of the schema file. 
""" env = os.environ.copy() env["GRAPHSCOPE_HOME"] = GRAPHSCOPE_HOME + container = self._engine_cluster.interactive_executor_container_name cmd = [ INTERACTIVE_ENGINE_SCRIPT, "create_gremlin_instance_on_k8s", @@ -389,7 +319,7 @@ def create_interactive_instance(self, object_id: int, schema_path: str): str(object_id), schema_path, self.hosts, - self._engine_container_name, + container, str(self._interactive_port), # executor port str(self._interactive_port + 1), # executor rpc port str(self._interactive_port + 2), # frontend port @@ -415,13 +345,15 @@ def create_interactive_instance(self, object_id: int, schema_path: str): def close_interactive_instance(self, object_id): env = os.environ.copy() env["GRAPHSCOPE_HOME"] = GRAPHSCOPE_HOME + container = self._engine_cluster.interactive_executor_container_name cmd = [ INTERACTIVE_ENGINE_SCRIPT, "close_gremlin_instance_on_k8s", self._session_workspace, str(object_id), self.hosts, - self._engine_container_name, + container, + self._instance_id, ] logger.info("Close GIE instance with command: %s", " ".join(cmd)) process = subprocess.Popen( @@ -441,488 +373,89 @@ def close_interactive_instance(self, object_id): def _create_mars_scheduler(self): logger.info("Launching mars scheduler pod for GraphScope ...") - - labels = { - "app.kubernetes.io/name": "graphscope", - "app.kubernetes.io/instance": self._instance_id, - "app.kubernetes.io/version": __version__, - "app.kubernetes.io/external": "mars", - } - - # create mars service - service_builder = ServiceBuilder( - self._mars_service_name, - service_type=self._saved_locals["service_type"], - port=[self._mars_scheduler_port, self._mars_scheduler_web_port], - selector=labels, - ) - self._resource_object.append( - self._core_api.create_namespaced_service( - self._saved_locals["namespace"], service_builder.build() - ) - ) - - # create engine replicaset - scheduler_builder = self._gs_mars_scheduler_builder_cls( - name=self._mars_scheduler_name, - labels=labels, - num_workers=1, - 
image_pull_policy=self._saved_locals["image_pull_policy"], - ) - # volume1 is for vineyard ipc socket - if self._exists_vineyard_daemonset(self._saved_locals["vineyard_daemonset"]): - vineyard_socket_volume_type = "hostPath" - vineyard_socket_volume_fields = { - "type": "Directory", - "path": "/var/run/vineyard-%s-%s" - % ( - self._saved_locals["namespace"], - self._saved_locals["vineyard_daemonset"], - ), - } - else: - vineyard_socket_volume_type = "emptyDir" - vineyard_socket_volume_fields = {} - scheduler_builder.add_volume( - VolumeBuilder( - name="vineyard-ipc-volume", - volume_type=vineyard_socket_volume_type, - field=vineyard_socket_volume_fields, - mounts_list=[ - {"mountPath": os.path.join(get_tempdir(), "vineyard_workspace")}, - ], - ) - ) - # volume2 is for shared memory - scheduler_builder.add_volume( - VolumeBuilder( - name="host-shm", - volume_type="emptyDir", - field={"medium": "Memory"}, - mounts_list=[{"mountPath": "/dev/shm"}], - ) - ) - # add env - scheduler_builder.add_simple_envs( - { - "GLOG_v": str(self._glog_level), - "VINEYARD_IPC_SOCKET": os.path.join( - get_tempdir(), "vineyard_workspace", "vineyard.sock" - ), - "WITH_VINEYARD": "ON", - } - ) - - # add vineyard container - if not self._exists_vineyard_daemonset( - self._saved_locals["vineyard_daemonset"] - ): - scheduler_builder.add_vineyard_container( - name=self._vineyard_container_name, - image=self._saved_locals["gs_image"], - cpu=self._saved_locals["vineyard_cpu"], - mem=self._saved_locals["vineyard_mem"], - shared_mem=self._saved_locals["vineyard_shared_mem"], - preemptive=self._saved_locals["preemptive"], - etcd_endpoints=self._get_etcd_endpoints(), - port=self._vineyard_service_port, - ) - - # add mars scheduler container - if self._saved_locals["with_mars"]: - scheduler_builder.add_mars_scheduler_container( - name=self._mars_scheduler_container_name, - image=self._saved_locals["gs_image"], - cpu=self._saved_locals["mars_scheduler_cpu"], - 
mem=self._saved_locals["mars_scheduler_mem"], - preemptive=self._saved_locals["preemptive"], - port=self._mars_scheduler_port, - web_port=self._mars_scheduler_web_port, - ) - for name in self._image_pull_secrets: - scheduler_builder.add_image_pull_secret(name) - - self._resource_object.append( - self._app_api.create_namespaced_replica_set( - self._saved_locals["namespace"], scheduler_builder.build() - ) - ) - - def _create_engine_replicaset(self): - logger.info("Launching GraphScope engines pod ...") - - labels = { - "app.kubernetes.io/name": "graphscope", - "app.kubernetes.io/instance": self._instance_id, - "app.kubernetes.io/version": __version__, - "app.kubernetes.io/component": "engine", - } - - # create engine replicaset - engine_builder = self._gs_engine_builder_cls( - name=self._engine_name, - labels=labels, - num_workers=self._num_workers, - image_pull_policy=self._saved_locals["image_pull_policy"], - ) - if self._engine_pod_node_selector: - engine_builder.add_engine_pod_node_selector(self._engine_pod_node_selector) - # volume1 is for vineyard ipc socket - # MaxGraph: /home/maxgraph/data/vineyard - if self._exists_vineyard_daemonset(self._saved_locals["vineyard_daemonset"]): - vineyard_socket_volume_type = "hostPath" - vineyard_socket_volume_fields = { - "type": "Directory", - "path": "/var/run/vineyard-%s-%s" - % ( - self._saved_locals["namespace"], - self._saved_locals["vineyard_daemonset"], - ), - } - else: - vineyard_socket_volume_type = "emptyDir" - vineyard_socket_volume_fields = {} - engine_builder.add_volume( - VolumeBuilder( - name="vineyard-ipc-volume", - volume_type=vineyard_socket_volume_type, - field=vineyard_socket_volume_fields, - mounts_list=[ - {"mountPath": "/tmp/vineyard_workspace"}, - ], - ) - ) - # volume2 is for shared memory - engine_builder.add_volume( - VolumeBuilder( - name="host-shm", - volume_type="emptyDir", - field={"medium": "Memory"}, - mounts_list=[{"mountPath": "/dev/shm"}], - ) - ) - - # Mount aliyun demo dataset bucket - 
if self._saved_locals["mount_dataset"] is not None: - self._volumes["dataset"] = { - "type": "emptyDir", - "field": {}, - "mounts": { - "mountPath": self._saved_locals["mount_dataset"], - "readOnly": True, - "mountPropagation": "HostToContainer", - }, - } - - # Mount user specified volumes - for name, volume in self._volumes.items(): - volume_builder = resolve_volume_builder(name, volume) - if volume_builder is not None: - engine_builder.add_volume(volume_builder) - - # add env - env = { - "GLOG_v": str(self._glog_level), - "VINEYARD_IPC_SOCKET": os.path.join( - get_tempdir(), "vineyard_workspace", "vineyard.sock" - ), - "WITH_VINEYARD": "ON", - "PATH": os.environ["PATH"], - "LD_LIBRARY_PATH": os.environ["LD_LIBRARY_PATH"], - "DYLD_LIBRARY_PATH": os.environ["DYLD_LIBRARY_PATH"], - } - if "OPAL_PREFIX" in os.environ: - env.update({"OPAL_PREFIX": os.environ["OPAL_PREFIX"]}) - if "OPAL_BINDIR" in os.environ: - env.update({"OPAL_BINDIR": os.environ["OPAL_BINDIR"]}) - - engine_builder.add_simple_envs(env) - - # add engine container - engine_builder.add_engine_container( - cmd=["tail", "-f", "/dev/null"], - name=self._engine_container_name, - image=self._saved_locals["gs_image"], - cpu=self._saved_locals["engine_cpu"], - mem=self._saved_locals["engine_mem"], - preemptive=self._saved_locals["preemptive"], - ) - - # add vineyard container - if not self._exists_vineyard_daemonset( - self._saved_locals["vineyard_daemonset"] - ): - engine_builder.add_vineyard_container( - name=self._vineyard_container_name, - image=self._saved_locals["gs_image"], - cpu=self._saved_locals["vineyard_cpu"], - mem=self._saved_locals["vineyard_mem"], - shared_mem=self._saved_locals["vineyard_shared_mem"], - preemptive=self._saved_locals["preemptive"], - etcd_endpoints=self._get_etcd_endpoints(), - port=self._vineyard_service_port, - ) - - # add mars worker container - if self._saved_locals["with_mars"]: - engine_builder.add_mars_worker_container( - name=self._mars_worker_container_name, - 
image=self._saved_locals["gs_image"], - cpu=self._saved_locals["mars_worker_cpu"], - mem=self._saved_locals["mars_worker_mem"], - preemptive=self._saved_locals["preemptive"], - port=self._mars_worker_port, - scheduler_endpoint="%s:%s" - % (self._mars_service_name, self._mars_scheduler_port), - ) - - if self._saved_locals["mount_dataset"]: - engine_builder.add_container( - { - "name": "dataset", - "image": self._saved_locals["dataset_image"], - "imagePullPolicy": self._saved_locals["image_pull_policy"], - "resources": { - "requests": { - "memory": "64Mi", - "cpu": "250m", - }, - "limits": { - "memory": "64Mi", - "cpu": "250m", - }, - }, - "volumeMounts": [ - { - "name": "dataset", - "mountPath": "/dataset", - "mountPropagation": "Bidirectional", - } - ], - "securityContext": {"privileged": True}, - } - ) - for name in self._image_pull_secrets: - engine_builder.add_image_pull_secret(name) - - self._resource_object.append( - self._app_api.create_namespaced_replica_set( - self._saved_locals["namespace"], engine_builder.build() - ) - ) - - def launch_etcd(self): - logger.info("Launching etcd ...") - - labels = { - "app.kubernetes.io/name": "graphscope", - "app.kubernetes.io/instance": self._instance_id, - "app.kubernetes.io/version": __version__, - "app.kubernetes.io/component": "etcd", - } - - # should create service first - service_builder = ServiceBuilder( - self._etcd_service_name, - service_type="ClusterIP", - port=self._etcd_listening_client_port, - selector=labels, - ) - self._resource_object.append( - self._core_api.create_namespaced_service( - self._saved_locals["namespace"], service_builder.build() - ) - ) - - time.sleep(1) - - # create etcd cluster - etcd_builder = self._gs_etcd_builder_cls( - name_prefix=self._etcd_name, - container_name=self._etcd_container_name, - service_name=self._etcd_service_name, - image=self._saved_locals["etcd_image"], - cpu=self._saved_locals["etcd_cpu"], - mem=self._saved_locals["etcd_mem"], - 
preemptive=self._saved_locals["preemptive"], - labels=labels, - image_pull_policy=self._saved_locals["image_pull_policy"], - num_pods=self._etcd_num_pods, - restart_policy="Always", - image_pull_secrets=self._image_pull_secrets, - listen_peer_service_port=self._etcd_listening_peer_port, - listen_client_service_port=self._etcd_listening_client_port, - ) - if self._etcd_pod_node_selector: - etcd_builder.add_etcd_pod_node_selector(self._etcd_pod_node_selector) - pods, services = etcd_builder.build() - for svc in services: - self._resource_object.append( - self._core_api.create_namespaced_service( - self._saved_locals["namespace"], svc.build() - ) - ) - for pod in pods: - self._resource_object.append( - self._core_api.create_namespaced_pod( - self._saved_locals["namespace"], pod.build() - ) - ) + deployment = self._mars_cluster.get_mars_deployment() + response = self._apps_api.create_namespaced_deployment( + self._namespace, deployment + ) + self._resource_object.append(response) + + def _create_engine_stateful_set(self): + logger.info("Create engine headless services...") + service = self._engine_cluster.get_engine_headless_service() + response = self._core_api.create_namespaced_service(self._namespace, service) + self._resource_object.append(response) + logger.info("Creating engine pods...") + stateful_set = self._engine_cluster.get_engine_stateful_set() + response = self._apps_api.create_namespaced_stateful_set( + self._namespace, stateful_set + ) + self._resource_object.append(response) + + def _create_frontend_deployment(self): + logger.info("Creating frontend pods...") + deployment = self._engine_cluster.get_interactive_frontend_deployment() + response = self._apps_api.create_namespaced_deployment( + self._namespace, deployment + ) + self._resource_object.append(response) + + def _create_frontend_service(self): + logger.info("Creating frontend service...") + service = self._engine_cluster.get_interactive_frontend_service(8233) + response = 
self._core_api.create_namespaced_service(self._namespace, service) + self._resource_object.append(response) def _create_vineyard_service(self): - # vineyard in engine pod - labels = { - "app.kubernetes.io/name": "graphscope", - "app.kubernetes.io/instance": self._instance_id, - "app.kubernetes.io/version": __version__, - "app.kubernetes.io/component": "engine", - } - - service_builder = ServiceBuilder( - self._vineyard_service_name, - service_type=self._saved_locals["service_type"], - port=self._vineyard_service_port, - selector=labels, - ) - self._resource_object.append( - self._core_api.create_namespaced_service( - self._saved_locals["namespace"], service_builder.build() - ) - ) + logger.info("Creating vineyard service...") + service = self._engine_cluster.get_vineyard_service() + response = self._core_api.create_namespaced_service(self._namespace, service) + self._resource_object.append(response) - def _get_vineyard_service_endpoint(self): - # len(endpoints) >= 1 - endpoints = get_service_endpoints( - api_client=self._api_client, - namespace=self._saved_locals["namespace"], - name=self._vineyard_service_name, - service_type=self._saved_locals["service_type"], + def _create_learning_service(self, object_id): + logger.info("Creating learning service...") + service = self._engine_cluster.get_learning_service( + object_id, self._learning_start_port ) - assert len(endpoints) >= 1 - return endpoints[0] - - def _get_mars_scheduler_service_endpoint(self): - # Always len(endpoints) >= 1 - endpoints = get_service_endpoints( - api_client=self._api_client, - namespace=self._saved_locals["namespace"], - name=self._mars_service_name, - service_type=self._saved_locals["service_type"], - query_port=self._mars_scheduler_web_port, - ) - return endpoints[0] - - def _create_graphlearn_service(self, object_id, start_port, num_workers): - targets = [] - - labels = { - "app.kubernetes.io/name": "graphscope", - "app.kubernetes.io/instance": self._instance_id, - 
"app.kubernetes.io/version": __version__, - "app.kubernetes.io/component": "engine", - } - - service_builder = ServiceBuilder( - self._gle_service_name_prefix + str(object_id), - service_type=self._saved_locals["service_type"], - port=list(range(start_port, start_port + num_workers)), - selector=labels, - external_traffic_policy="Local", - ) - targets.append( - self._core_api.create_namespaced_service( - self._saved_locals["namespace"], service_builder.build() - ) - ) - self._graphlearn_services[object_id] = targets - self._resource_object.extend(targets) - - def _parse_graphlearn_service_endpoint(self, object_id): - if self._saved_locals["service_type"] == "NodePort": - services = self._core_api.list_namespaced_service( - self._saved_locals["namespace"] - ) - for svc in services.items: - if svc.metadata.name == self._gle_service_name_prefix + str(object_id): - endpoints = [] - for ip, port_spec in zip(self._pod_host_ip_list, svc.spec.ports): - endpoints.append( - ( - "%s:%s" % (ip, port_spec.node_port), - int(port_spec.name.split("-")[-1]), - ) - ) - endpoints.sort(key=lambda ep: ep[1]) - return [ep[0] for ep in endpoints] - elif self._saved_locals["service_type"] == "LoadBalancer": - endpoints = get_service_endpoints( - api_client=self._api_client, - namespace=self._saved_locals["namespace"], - name=self._gle_service_name_prefix + str(object_id), - service_type=self._saved_locals["service_type"], - ) - return endpoints - raise RuntimeError("Get graphlearn service endpoint failed.") + response = self._core_api.create_namespaced_service(self._namespace, service) + self._graphlearn_services[object_id] = response + self._resource_object.append(response) def get_engine_config(self): config = { - "vineyard_service_name": self.get_vineyard_service_name(), - "vineyard_rpc_endpoint": self.get_vineyard_rpc_endpoint(), - "mars_endpoint": self.get_mars_scheduler_endpoint(), + "vineyard_service_name": self._engine_cluster.vineyard_service_name, + "vineyard_rpc_endpoint": 
self._vineyard_service_endpoint, } + if self._with_mars: + config["mars_endpoint"] = self._mars_service_endpoint return config - def configure_etcd_endpoint(self): - if self._etcd_addrs is None: - self.launch_etcd() - self._etcd_endpoint = self._get_etcd_service_endpoint() - logger.info("etcd cluster created") - else: - self._etcd_endpoint = self._etcd_addrs - logger.info("Using external etcd cluster") - logger.info("etcd endpoint is %s", self._etcd_endpoint) - - def _get_etcd_endpoints(self): - etcd_addrs = [] - if self._etcd_addrs is None: - port = self._etcd_listening_client_port - etcd_addrs.append("%s:%s" % (self._etcd_service_name, port)) - for i in range(self._etcd_num_pods): - etcd_addrs.append("%s-%d:%s" % (self._etcd_name, i, port)) - else: - etcd_addrs = self._etcd_addrs.split(",") - etcd_endpoints = ["http://%s" % i for i in etcd_addrs if i] - return etcd_endpoints - def _create_services(self): - self.configure_etcd_endpoint() - - if self._saved_locals["with_mars"]: - # scheduler used by mars + self._create_engine_stateful_set() + if self._with_interactive: + self._create_frontend_deployment() + # self._create_frontend_service() + if self._with_mars: + # scheduler used by Mars self._create_mars_scheduler() - - logger.info("Creating engine replicaset...") - self._create_engine_replicaset() - if not self._exists_vineyard_daemonset( - self._saved_locals["vineyard_daemonset"] - ): + if self._vineyard_daemonset is None: self._create_vineyard_service() def _waiting_for_services_ready(self): + logger.info("Waiting for services ready...") + selector = "" + namespace = self._namespace start_time = time.time() event_messages = [] - engine_pod_selector = "" while True: - replicasets = self._app_api.list_namespaced_replica_set( - namespace=self._saved_locals["namespace"] - ) + # TODO: Add label selector to filter out deployments. 
+ statefulsets = self._apps_api.list_namespaced_stateful_set(namespace) service_available = False - for rs in replicasets.items: - if rs.metadata.name == self._engine_name: + for rs in statefulsets.items: + if rs.metadata.name == self._engine_cluster.engine_stateful_set_name: # logger.info( - # "Engine pod: {} ready / {} total".format( - # str(rs.status.ready_replicas), self._num_workers - # ) + # "Engine pod: %s ready / %s total", + # rs.status.ready_replicas, + # self._num_workers, # ) if rs.status.ready_replicas == self._num_workers: # service is ready @@ -930,15 +463,11 @@ def _waiting_for_services_ready(self): break # check container status - selector = "" - for k, v in rs.spec.selector.match_labels.items(): - selector += f"{k}={v}," - selector = selector[:-1] # remove last comma - engine_pod_selector = selector + labels = rs.spec.selector.match_labels + selector = ",".join(f"{k}={v}" for k, v in labels.items()) pods = self._core_api.list_namespaced_pod( - namespace=self._saved_locals["namespace"], - label_selector=selector, + namespace=namespace, label_selector=selector ) for pod in pods.items: @@ -946,7 +475,7 @@ def _waiting_for_services_ready(self): field_selector = "involvedObject.name=" + pod_name stream = kube_watch.Watch().stream( self._core_api.list_namespaced_event, - self._saved_locals["namespace"], + namespace, field_selector=field_selector, timeout_seconds=1, ) @@ -956,14 +485,11 @@ def _waiting_for_services_ready(self): event_messages.append(msg) logger.info(msg) if event["object"].reason == "Failed": - raise RuntimeError("Kubernetes event error: ", msg) + raise RuntimeError("Kubernetes event error: " + msg) if service_available: break - if ( - self._saved_locals["timeout_seconds"] - and self._saved_locals["timeout_seconds"] + start_time < time.time() - ): + if self._timeout_seconds + start_time < time.time(): raise TimeoutError("GraphScope Engines launching timeout.") time.sleep(2) @@ -971,80 +497,63 @@ def _waiting_for_services_ready(self): 
self._pod_ip_list = [] self._pod_host_ip_list = [] pods = self._core_api.list_namespaced_pod( - namespace=self._saved_locals["namespace"], - label_selector=engine_pod_selector, + namespace=namespace, label_selector=selector ) for pod in pods.items: self._pod_name_list.append(pod.metadata.name) self._pod_ip_list.append(pod.status.pod_ip) self._pod_host_ip_list.append(pod.status.host_ip) - assert len(self._pod_ip_list) >= 1 - self._host0 = self._pod_ip_list[0] - self._analytical_engine_endpoint = "{}:{}".format( - self._host0, self._random_analytical_engine_rpc_port + assert len(self._pod_ip_list) > 0 + self._analytical_engine_endpoint = ( + f"{self._pod_ip_list[0]}:{self._random_analytical_engine_rpc_port}" + ) + + self._vineyard_service_endpoint = ( + self._engine_cluster.get_vineyard_service_endpoint(self._api_client) + ) + self.vineyard_internal_endpoint = ( + f"{self._pod_ip_list[0]}:{self._engine_cluster._vineyard_service_port}" ) - # get vineyard service endpoint - self._vineyard_service_endpoint = self._get_vineyard_service_endpoint() - if self._saved_locals["with_mars"]: - self._mars_service_endpoint = ( - "http://" + self._get_mars_scheduler_service_endpoint() - ) logger.info("GraphScope engines pod is ready.") logger.info("Engines pod name list: %s", self._pod_name_list) logger.info("Engines pod ip list: %s", self._pod_ip_list) logger.info("Engines pod host ip list: %s", self._pod_host_ip_list) logger.info("Vineyard service endpoint: %s", self._vineyard_service_endpoint) - if self._saved_locals["with_mars"]: + if self._with_mars: + self._mars_service_endpoint = self._mars_cluster.get_mars_service_endpoint( + self._api_client + ) logger.info("Mars service endpoint: %s", self._mars_service_endpoint) def _dump_resource_object(self): resource = {} - if self._saved_locals["delete_namespace"]: - resource[self._saved_locals["namespace"]] = "Namespace" + if self._delete_namespace: + resource[self._namespace] = "Namespace" else: # coordinator info 
resource[self._coordinator_name] = "Deployment" resource[self._coordinator_service_name] = "Service" - - self._resource_object.dump_with_extra_resource(resource) - - def _get_etcd_service_endpoint(self): - # Always len(endpoints) >= 1 - endpoints = get_service_endpoints( - api_client=self._api_client, - namespace=self._saved_locals["namespace"], - name=self._etcd_service_name, - service_type="ClusterIP", - ) - return endpoints[0] + self._resource_object.dump(extra_resource=resource) def create_analytical_instance(self): + if not (self._with_analytical or self._with_analytical_java): + raise NotImplementedError("Analytical engine not enabled") logger.info( - "Starting GAE rpc service on {} ...".format( - str(self._analytical_engine_endpoint) - ) + "Starting GAE rpc service on %s ...", self._analytical_engine_endpoint ) # generate and distribute hostfile kube_hosts_path = os.path.join(get_tempdir(), "kube_hosts") with open(kube_hosts_path, "w") as f: for i, pod_ip in enumerate(self._pod_ip_list): - f.write("{} {}\n".format(pod_ip, self._pod_name_list[i])) + f.write(f"{pod_ip} {self._pod_name_list[i]}\n") for pod in self._pod_name_list: - subprocess.check_call( - [ - shutil.which("kubectl"), - "-n", - self._saved_locals["namespace"], - "cp", - kube_hosts_path, - "{}:/tmp/hosts_of_nodes".format(pod), - "-c", - self._engine_container_name, - ] - ) + container = self._engine_cluster.analytical_container_name + cmd = f"kubectl -n {self._namespace} cp {kube_hosts_path} {pod}:/tmp/hosts_of_nodes -c {container}" + cmd = shlex.split(cmd) + subprocess.check_call(cmd) # launch engine rmcp = ResolveMPICmdPrefix(rsh_agent=True) @@ -1053,20 +562,12 @@ def create_analytical_instance(self): cmd.append(ANALYTICAL_ENGINE_PATH) cmd.extend(["--host", "0.0.0.0"]) cmd.extend(["--port", str(self._random_analytical_engine_rpc_port)]) - cmd.extend(["--vineyard_shared_mem", self._saved_locals["vineyard_shared_mem"]]) - if rmcp.openmpi(): - cmd.extend(["-v", str(self._glog_level)]) - else: - 
mpi_env["GLOG_v"] = str(self._glog_level) + cmd.extend(["-v", str(self._glog_level)]) + mpi_env["GLOG_v"] = str(self._glog_level) - cmd.extend( - [ - "--vineyard_socket", - os.path.join(get_tempdir(), "vineyard_workspace", "vineyard.sock"), - ] - ) - logger.info("Analytical engine launching command: {}".format(" ".join(cmd))) + cmd.extend(["--vineyard_socket", self._engine_cluster.vineyard_ipc_socket]) + logger.info("Analytical engine launching command: %s", " ".join(cmd)) env = os.environ.copy() env["GRAPHSCOPE_HOME"] = GRAPHSCOPE_HOME @@ -1094,130 +595,125 @@ def create_analytical_instance(self): def _delete_dangling_coordinator(self): # delete service - self._core_api.delete_namespaced_service( - name=self._coordinator_service_name, - namespace=self._saved_locals["namespace"], - ) - self._app_api.delete_namespaced_deployment( - name=self._coordinator_name, namespace=self._saved_locals["namespace"] - ) - if self._saved_locals["waiting_for_delete"]: + try: + self._core_api.delete_namespaced_service( + self._coordinator_service_name, self._namespace + ) + except K8SApiException as ex: + if ex.status == 404: + logger.warning( + "coordinator service %s not found", self._coordinator_service_name + ) + else: + logger.exception( + "Deleting dangling coordinator service %s failed", + self._coordinator_service_name, + ) + try: + self._apps_api.delete_namespaced_deployment( + self._coordinator_name, self._namespace + ) + except K8SApiException as ex: + if ex.status == 404: + logger.warning( + "coordinator deployment %s not found", self._coordinator_name + ) + else: + logger.exception( + "Deleting dangling coordinator %s failed", self._coordinator_name + ) + + if self._waiting_for_delete: start_time = time.time() while True: try: - self._app_api.read_namespaced_deployment( - name=self._coordinator_name, - namespace=self._saved_locals["namespace"], + self._apps_api.read_namespaced_deployment( + self._coordinator_name, self._namespace ) except K8SApiException as ex: if 
ex.status != 404: - logger.error( - "Deleting dangling coordinator {} failed: {}".format( - self._coordinator_name, str(ex) - ) + logger.exception( + "Deleting dangling coordinator %s failed", + self._coordinator_name, ) break else: time.sleep(1) - if time.time() - start_time > self._saved_locals["timeout_seconds"]: + if time.time() - start_time > self._timeout_seconds: logger.error( - "Deleting dangling coordinator {} timeout".format( - self._coordinator_name - ) + "Deleting dangling coordinator %s timeout", + self._coordinator_name, ) - def _exists_vineyard_daemonset(self, release): - # check if vineyard daemonset exists. - if not release: - return False - try: - self._app_api.read_namespaced_daemon_set( - release, self._saved_locals["namespace"] - ) - except K8SApiException: - return False - return True - def start(self): + if self._serving: + return True try: self._create_services() self._waiting_for_services_ready() self._dump_resource_object() - except Exception as e: + self._serving = True + except Exception: # pylint: disable=broad-except time.sleep(1) - logger.error( - "Error when launching GraphScope on kubernetes cluster: %s, with traceback: %s", - repr(e), - traceback.format_exc(), - ) + logger.exception("Error when launching GraphScope on kubernetes cluster") self.stop() return False return True def stop(self, is_dangling=False): - if not self._closed: + if self._serving: + logger.info("Cleaning up kubernetes resources") for target in self._resource_object: delete_kubernetes_object( api_client=self._api_client, target=target, - wait=self._saved_locals["waiting_for_delete"], - timeout_seconds=self._saved_locals["timeout_seconds"], + wait=self._waiting_for_delete, + timeout_seconds=self._timeout_seconds, ) - self._resource_object = [] + self._resource_object.clear() if is_dangling: logger.info("Dangling coordinator detected, cleaning up...") # delete everything inside namespace of graphscope instance - if self._saved_locals["delete_namespace"]: + if 
self._delete_namespace: # delete namespace created by graphscope - self._core_api.delete_namespace(self._saved_locals["namespace"]) - if self._saved_locals["waiting_for_delete"]: + self._core_api.delete_namespace(self._namespace) + if self._waiting_for_delete: start_time = time.time() while True: try: - self._core_api.read_namespace( - self._saved_locals["namespace"] - ) + self._core_api.read_namespace(self._namespace) except K8SApiException as ex: if ex.status != 404: - logger.error( - "Deleting dangling namespace {} failed: {}".format( - self._saved_locals["namespace"], str(ex) - ) + logger.exception( + "Deleting dangling namespace %s failed", + self._namespace, ) break else: time.sleep(1) - if ( - time.time() - start_time - > self._saved_locals["timeout_seconds"] - ): + if time.time() - start_time > self._timeout_seconds: logger.error( - "Deleting namespace %s timeout" - % self._saved_locals["namespace"] + "Deleting namespace %s timeout", self._namespace ) else: # delete coordinator deployment and service self._delete_dangling_coordinator() - self._closed = True + self._serving = False + logger.info("Kubernetes launcher stopped") def create_learning_instance(self, object_id, handle, config): + if not self._with_learning: + raise NotImplementedError("Learning engine not enabled") # allocate service for ports - self._create_graphlearn_service( - object_id, self._learning_engine_ports_usage, len(self._pod_name_list) - ) - # prepare arguments handle = json.loads(base64.b64decode(handle.encode("utf-8")).decode("utf-8")) hosts = ",".join( [ - "%s:%s" % (pod_name, port) + f"{pod_name}:{port}" for pod_name, port in zip( self._pod_name_list, - range( - self._learning_engine_ports_usage, - self._learning_engine_ports_usage + len(self._pod_name_list), - ), + self._engine_cluster.get_learning_ports(self._learning_start_port), ) ] ) @@ -1227,23 +723,11 @@ def create_learning_instance(self, object_id, handle, config): # launch the server 
self._learning_instance_processes[object_id] = [] for pod_index, pod in enumerate(self._pod_name_list): - cmd = [ - "kubectl", - "-n", - self._saved_locals["namespace"], - "exec", - "-it", - "-c", - self._engine_container_name, - pod, - "--", - "python3", - "-m" "gscoordinator.learning", - handle, - config, - str(pod_index), - ] + container = self._engine_cluster.learning_container_name + sub_cmd = f"/opt/rh/rh-python38/root/usr/bin/python3 -m gscoordinator.learning {handle} {config} {pod_index}" + cmd = f"kubectl -n {self._namespace} exec -it -c {container} {pod} -- {sub_cmd}" logging.debug("launching learning server: %s", " ".join(cmd)) + cmd = shlex.split(cmd) proc = subprocess.Popen( cmd, stdout=subprocess.PIPE, @@ -1263,34 +747,97 @@ def create_learning_instance(self, object_id, handle, config): self._learning_instance_processes[object_id].append(proc) # update the port usage record - self._learning_engine_ports_usage += len(self._pod_name_list) + self._learning_start_port += len(self._pod_name_list) + # Create Service + self._create_learning_service(object_id) # parse the service hosts and ports - return self._parse_graphlearn_service_endpoint(object_id) + return self._engine_cluster.get_graphlearn_service_endpoint( + self._api_client, object_id, self._pod_host_ip_list + ) def close_learning_instance(self, object_id): if object_id not in self._learning_instance_processes: return - # delete the services - for target in self._graphlearn_services[object_id]: - try: - delete_kubernetes_object( - api_client=self._api_client, - target=target, - wait=self._saved_locals["waiting_for_delete"], - timeout_seconds=self._saved_locals["timeout_seconds"], - ) - except Exception as e: - logger.error( - "Failed to delete graphlearn service for %s, %s", object_id, e - ) + target = self._graphlearn_services[object_id] + try: + delete_kubernetes_object( + api_client=self._api_client, + target=target, + wait=self._waiting_for_delete, + timeout_seconds=self._timeout_seconds, + ) + 
except Exception: # pylint: disable=broad-except + logger.exception("Failed to delete graphlearn service for %s", object_id) # terminate the process for proc in self._learning_instance_processes[object_id]: try: proc.terminate() proc.wait(1) - except Exception as e: - logger.error("Failed to terminate graphlearn server: %s", e) + except Exception: # pylint: disable=broad-except + logger.exception("Failed to terminate graphlearn server") self._learning_instance_processes[object_id].clear() + + +class ResourceManager(object): + """A class to manager kubernetes object. + + Object managed by this class will dump meta info to disk file + for pod preStop lifecycle management. + + meta info format: + + { + "my-deployment": "Deployment", + "my-service": "Service" + } + """ + + _resource_object_path = os.path.join(get_tempdir(), "resource_object") # fixed + + def __init__(self, api_client): + self._api_client = api_client + self._resource_object = [] + self._meta_info = {} + + def append(self, target): + self._resource_object.append(target) + self._meta_info.update( + get_kubernetes_object_info(api_client=self._api_client, target=target) + ) + self.dump() + + def extend(self, targets): + self._resource_object.extend(targets) + for target in targets: + self._meta_info.update( + get_kubernetes_object_info(api_client=self._api_client, target=target) + ) + self.dump() + + def clear(self): + self._resource_object.clear() + self._meta_info.clear() + + def __str__(self): + return str(self._meta_info) + + def __getitem__(self, index): + return self._resource_object[index] + + def dump(self, extra_resource=None): + """Dump meta info to disk file. + Args: + extra_resource (dict): extra resource to dump. + A typical scenario is dumping meta info of namespace + for coordinator dangling processing. 
+ """ + if extra_resource is not None: + rlt = copy.deepcopy(self._meta_info) + rlt.update(extra_resource) + else: + rlt = self._meta_info + with open(self._resource_object_path, "w") as f: + json.dump(rlt, f) diff --git a/coordinator/gscoordinator/launcher.py b/coordinator/gscoordinator/launcher.py index ee7f5832ce71..ee173132e60a 100644 --- a/coordinator/gscoordinator/launcher.py +++ b/coordinator/gscoordinator/launcher.py @@ -32,6 +32,8 @@ def configure_environ(): # OPAL_PREFIX for openmpi if os.path.isdir(os.path.join(GRAPHSCOPE_HOME, "openmpi")): os.environ["OPAL_PREFIX"] = os.path.join(GRAPHSCOPE_HOME, "openmpi") + if os.path.isdir(os.path.join("/opt", "openmpi")): + os.environ["OPAL_PREFIX"] = os.path.join("/opt", "openmpi") # Darwin is open-mpi if os.path.isdir(os.path.join(GRAPHSCOPE_HOME, "open-mpi")): os.environ["OPAL_PREFIX"] = os.path.join(GRAPHSCOPE_HOME, "open-mpi") @@ -59,6 +61,7 @@ def __init__(self): self._instance_id = None self._num_workers = None self._hosts = "" + self._analytical_engine_process = None self._analytical_engine_endpoint = None self._session_workspace = None configure_environ() @@ -123,6 +126,10 @@ def distribute_file(self, path): def analytical_engine_endpoint(self): return self._analytical_engine_endpoint + @property + def analytical_engine_process(self): + return self._analytical_engine_process + @property def num_workers(self): if self._num_workers is None: diff --git a/coordinator/gscoordinator/local_launcher.py b/coordinator/gscoordinator/local_launcher.py index 3e307f3c58ad..f898d0c501d4 100644 --- a/coordinator/gscoordinator/local_launcher.py +++ b/coordinator/gscoordinator/local_launcher.py @@ -24,6 +24,7 @@ from gscoordinator.utils import ResolveMPICmdPrefix from gscoordinator.utils import get_timestamp from gscoordinator.utils import parse_as_glog_level +from gscoordinator.utils import run_command logger = logging.getLogger("graphscope") @@ -467,18 +468,11 @@ def _stop_subprocess(proc, kill=False) -> None: 
proc.terminate() def distribute_file(self, path) -> None: - d = os.path.dirname(path) + dir = os.path.dirname(path) for host in self.hosts.split(","): if host not in ("localhost", "127.0.0.1"): - # TODO: handle failure, The error message is in CallProcessError.output as bytes - subprocess.check_output( - [shutil.which("ssh"), host, "mkdir -p {}".format(d)], - stderr=subprocess.STDOUT, - ) - subprocess.check_output( - [shutil.which("scp"), "-r", path, "{}:{}".format(host, path)], - stderr=subprocess.STDOUT, - ) + logger.debug(run_command(f"ssh {host} mkdir -p {dir}")) + logger.debug(run_command(f"scp -r {path} {host}:{path}")) @staticmethod def find_etcd() -> [str]: @@ -512,10 +506,17 @@ def configure_etcd_endpoint(self): logger.info("etcd endpoint is %s", self._etcd_endpoint) def start(self): - # create etcd - self.configure_etcd_endpoint() - # create vineyard - self.launch_vineyard() + try: + # create etcd + self.configure_etcd_endpoint() + # create vineyard + self.launch_vineyard() + except Exception: # pylint: disable=broad-except + time.sleep(1) + logger.exception("Error when launching GraphScope on local") + self.stop() + return False + return True def get_engine_config(self) -> dict: config = { diff --git a/coordinator/gscoordinator/op_executor.py b/coordinator/gscoordinator/op_executor.py index 50e2e85280e1..1fffa21400a0 100644 --- a/coordinator/gscoordinator/op_executor.py +++ b/coordinator/gscoordinator/op_executor.py @@ -251,7 +251,10 @@ def _maybe_compile_app(self, op): if not os.path.isfile(app_lib_path): # compile and distribute compiled_path = self._compile_lib_and_distribute( - compile_app, app_sig, op + compile_app, + app_sig, + op, + self._java_class_path, ) if app_lib_path != compiled_path: msg = f"Computed app library path != compiled path, {app_lib_path} versus {compiled_path}" @@ -274,7 +277,9 @@ def _maybe_register_graph(self, op): if not os.path.isfile(graph_lib_path): # compile and distribute compiled_path = self._compile_lib_and_distribute( 
- compile_graph_frame, graph_sig, op + compile_graph_frame, + graph_sig, + op, ) if graph_lib_path != compiled_path: raise RuntimeError( @@ -371,7 +376,7 @@ def get_analytical_engine_config(self) -> {}: config["enable_java_sdk"] = "OFF" return config - def _compile_lib_and_distribute(self, compile_func, lib_name, op): + def _compile_lib_and_distribute(self, compile_func, lib_name, op, *args, **kwargs): algo_name = op.attr[types_pb2.APP_ALGO].s.decode("utf-8") if ( types_pb2.GAR in op.attr @@ -381,19 +386,21 @@ def _compile_lib_and_distribute(self, compile_func, lib_name, op): space = self._udf_app_workspace else: space = self._builtin_workspace - app_lib_path, java_jar_path, java_ffi_path, app_type = compile_func( + lib_path, java_jar_path, java_ffi_path, app_type = compile_func( space, lib_name, op.attr, self.get_analytical_engine_config(), - self._java_class_path, + self._launcher, + *args, + **kwargs, ) # for java app compilation, we need to distribute the jar and ffi generated if app_type == "java_pie": self._launcher.distribute_file(java_jar_path) self._launcher.distribute_file(java_ffi_path) - self._launcher.distribute_file(app_lib_path) - return app_lib_path + self._launcher.distribute_file(lib_path) + return lib_path def heart_beat(self, request): return self.analytical_grpc_stub.HeartBeat(request) @@ -645,12 +652,13 @@ def load_subgraph( # only 1 GIE executor on local cluster executor_workers_num = 1 threads_per_executor = self._launcher.num_workers * threads_per_worker + engine_config = self.get_analytical_engine_config() + vineyard_rpc_endpoint = engine_config["vineyard_rpc_endpoint"] else: executor_workers_num = self._launcher.num_workers threads_per_executor = threads_per_worker + vineyard_rpc_endpoint = self._launcher.vineyard_internal_endpoint total_builder_chunks = executor_workers_num * threads_per_executor - engine_config = self.get_analytical_engine_config() - vineyard_rpc_endpoint = engine_config["vineyard_rpc_endpoint"] ( _graph_builder_id, @@ 
-716,7 +724,10 @@ def _process_data_sink(self, op: op_def_pb2.OpDef): fd = op.attr[types_pb2.FD].s.decode() df = op.attr[types_pb2.VINEYARD_ID].s.decode() engine_config = self.get_analytical_engine_config() - vineyard_endpoint = engine_config["vineyard_rpc_endpoint"] + if self._launcher.type() == types_pb2.HOSTS: + vineyard_endpoint = engine_config["vineyard_rpc_endpoint"] + else: + vineyard_endpoint = self._launcher.vineyard_internal_endpoint vineyard_ipc_socket = engine_config["vineyard_socket"] deployment, hosts = self._launcher.get_vineyard_stream_info() dfstream = vineyard.io.open( @@ -812,7 +823,10 @@ def _process_loader_func(loader, vineyard_endpoint, vineyard_ipc_socket): loader.attr[types_pb2.SOURCE].CopyFrom(utils.s_to_attr(new_source)) engine_config = self.get_analytical_engine_config() - vineyard_endpoint = engine_config["vineyard_rpc_endpoint"] + if self._launcher.type() == types_pb2.HOSTS: + vineyard_endpoint = engine_config["vineyard_rpc_endpoint"] + else: + vineyard_endpoint = self._launcher.vineyard_internal_endpoint vineyard_ipc_socket = engine_config["vineyard_socket"] for loader in op.large_attr.chunk_meta_list.items: diff --git a/coordinator/gscoordinator/template/CMakeLists.template b/coordinator/gscoordinator/template/CMakeLists.template index 0efa55cff692..30d9edcca075 100644 --- a/coordinator/gscoordinator/template/CMakeLists.template +++ b/coordinator/gscoordinator/template/CMakeLists.template @@ -352,7 +352,7 @@ elseif (JAVA_PIE_APP) target_link_libraries(${FRAME_NAME} ${CMAKE_JNI_LINKER_FLAGS}) set_target_properties(${FRAME_NAME} PROPERTIES COMPILE_FLAGS "-fPIC") # 2.Post build after compilation, run run-llvm4jni.sh to generate bitcode, if clang enabled. 
- if ((CMAKE_CXX_COMPILER_ID MATCHES "Clang") AND RUN_LLVM4JNI_SH) + if ((CMAKE_CXX_COMPILER_ID MATCHES "Clang") AND (RUN_LLVM4JNI_SH AND (EXISTS ${RUN_LLVM4JNI_SH}))) add_custom_command(TARGET ${FRAME_NAME} POST_BUILD COMMAND bash ${RUN_LLVM4JNI_SH} "-output" ${LLVM4JNI_OUTPUT} diff --git a/coordinator/gscoordinator/utils.py b/coordinator/gscoordinator/utils.py index 2ec3afa25618..faa84b3e3361 100644 --- a/coordinator/gscoordinator/utils.py +++ b/coordinator/gscoordinator/utils.py @@ -25,6 +25,7 @@ import json import logging import os +import shlex import shutil import subprocess import sys @@ -100,19 +101,17 @@ GRAPHSCOPE_HOME = DEFAULT_GRAPHSCOPE_HOME # resolve from develop source tree +# Here the GRAPHSCOPE_HOME has been set to the root of the source tree, +# So the engine location doesn't need to check again, +# just rely on GRAPHSCOPE_HOME. if GRAPHSCOPE_HOME is None: GRAPHSCOPE_HOME = os.path.join(COORDINATOR_HOME, "..") # ANALYTICAL_ENGINE_HOME # 1) infer from GRAPHSCOPE_HOME -ANALYTICAL_ENGINE_HOME = os.path.join(GRAPHSCOPE_HOME) +ANALYTICAL_ENGINE_HOME = GRAPHSCOPE_HOME ANALYTICAL_ENGINE_PATH = os.path.join(ANALYTICAL_ENGINE_HOME, "bin", "grape_engine") -if not os.path.isfile(ANALYTICAL_ENGINE_PATH): - # try to get analytical engine from build dir - ANALYTICAL_ENGINE_HOME = os.path.join(GRAPHSCOPE_HOME, "analytical_engine") - ANALYTICAL_ENGINE_PATH = os.path.join( - ANALYTICAL_ENGINE_HOME, "build", "grape_engine" - ) + ANALYTICAL_BUILTIN_SPACE = os.path.join(GRAPHSCOPE_HOME, "precompiled", "builtin") # ANALYTICAL_ENGINE_JAVA_HOME @@ -121,29 +120,19 @@ ANALYTICAL_ENGINE_JAVA_RUNTIME_JAR = os.path.join( ANALYTICAL_ENGINE_JAVA_HOME, "lib", - "grape-runtime-{}-shaded.jar".format(__version__), + f"grape-runtime-{__version__}-shaded.jar", ) -ANALYTICAL_ENGINE_JAVA_INIT_CLASS_PATH = "{}".format(ANALYTICAL_ENGINE_JAVA_RUNTIME_JAR) +ANALYTICAL_ENGINE_JAVA_INIT_CLASS_PATH = ANALYTICAL_ENGINE_JAVA_RUNTIME_JAR -ANALYTICAL_ENGINE_JAVA_JVM_OPTS = ( - 
"-Djava.library.path={}/lib -Djava.class.path={}".format( - GRAPHSCOPE_HOME, - ANALYTICAL_ENGINE_JAVA_INIT_CLASS_PATH, - ) +ANALYTICAL_ENGINE_JAVA_JVM_OPTS = f"-Djava.library.path={GRAPHSCOPE_HOME}/lib" +ANALYTICAL_ENGINE_JAVA_JVM_OPTS += ( + f" -Djava.class.path={ANALYTICAL_ENGINE_JAVA_INIT_CLASS_PATH}" ) # INTERACTIVE_ENGINE_SCRIPT INTERACTIVE_INSTANCE_TIMEOUT_SECONDS = 120 # 2 mins INTERACTIVE_ENGINE_SCRIPT = os.path.join(GRAPHSCOPE_HOME, "bin", "giectl") -if not os.path.isfile(INTERACTIVE_ENGINE_SCRIPT): - INTERACTIVE_ENGINE_SCRIPT = os.path.join( - GRAPHSCOPE_HOME, ".install_prefix", "bin", "giectl" - ) -if not os.path.isfile(INTERACTIVE_ENGINE_SCRIPT): - INTERACTIVE_ENGINE_SCRIPT = os.path.join( - GRAPHSCOPE_HOME, "interactive_engine", "bin", "giectl" - ) # default threads per worker configuration for GIE/GAIA INTERACTIVE_ENGINE_THREADS_PER_WORKER = 2 @@ -154,7 +143,7 @@ PROCESSOR_MAIN_CLASS = "com.alibaba.graphscope.annotation.Main" JAVA_CODEGEN_OUTPUT_PREFIX = "gs-ffi" GRAPE_PROCESSOR_JAR = os.path.join( - GRAPHSCOPE_HOME, "lib", "grape-runtime-{}-shaded.jar".format(__version__) + GRAPHSCOPE_HOME, "lib", f"grape-runtime-{__version__}-shaded.jar" ) GIRAPH_DRIVER_CLASS = "com.alibaba.graphscope.app.GiraphComputationAdaptor" @@ -254,8 +243,116 @@ def check_java_app_graph_consistency( return True +def run_command(args: str, cwd=None): + logger.info("Running command: %s, cwd: %s", args, cwd) + cp = subprocess.run(shlex.split(args), capture_output=True, cwd=cwd) + if cp.returncode != 0: + err = cp.stderr.decode("ascii") + logger.error( + "Failed to run command: %s, error message is: %s", + args, + err, + ) + raise RuntimeError(f"Failed to run command: {args}, err: {err}") + return cp.stdout.decode("ascii") + + +def delegate_command_to_pod(args: str, pod: str, container: str): + """Delegate a command to a pod. + + Args: + command (str): Command to be delegated. + pod_name (str): Pod name. + namespace (str): Namespace of the pod. 
+ + Returns: + str: Output of the command. + """ + # logger.info("Delegate command to pod: %s, %s, %s", args, pod, container) + args = f'kubectl exec -c {container} {pod} -- bash -c "{args}"' + return run_command(args) + + +def compile_library(commands, workdir, output_name, launcher): + if launcher.type() == types_pb2.K8S: + return _compile_on_kubernetes( + commands, + workdir, + output_name, + launcher.hosts_list[0], + launcher._engine_cluster.analytical_container_name, + ) + elif launcher.type() == types_pb2.HOSTS: + return _compile_on_local(commands, workdir, output_name) + else: + raise RuntimeError(f"Unsupported launcher type: {launcher.type()}") + + +def _compile_on_kubernetes(commands, workdir, output_name, pod, container): + logger.info( + "compile on kubernetes, %s, %s, %s, %s, %s", + commands, + workdir, + output_name, + pod, + container, + ) + try: + full_path = get_lib_path(workdir, output_name) + try: + # The library may exists in the analytical pod. + test_cmd = f"test -f {full_path}" + logger.debug(delegate_command_to_pod(test_cmd, pod, container)) + logger.info("Library exists, skip compilation") + cp = f"kubectl cp {pod}:{full_path} {full_path} -c {container}" + logger.debug(run_command(cp)) + return full_path + except RuntimeError: + pass + parent_dir = os.path.dirname(workdir) + mkdir = f"mkdir -p {parent_dir}" + logger.debug(delegate_command_to_pod(mkdir, pod, container)) + cp = f"kubectl cp {workdir} {pod}:{workdir} -c {container}" + logger.debug(run_command(cp)) + prepend = "source scl_source enable devtoolset-10 rh-python38 &&" + for command in commands: + command = f"{prepend} cd {workdir} && {command}" + logger.debug(delegate_command_to_pod(command, pod, container)) + cp = f"kubectl cp {pod}:{full_path} {full_path} -c {container}" + logger.debug(run_command(cp)) + if not os.path.isfile(full_path): + logger.error("Could not find desired library, found files are:") + logger.error(os.listdir(workdir)) + raise FileNotFoundError(full_path) + 
except Exception as e: + raise CompilationError(f"Failed to compile {output_name} on kubernetes") from e + return full_path + + +def _compile_on_local(commands, workdir, output_name): + logger.info("compile on local, %s, %s, %s", commands, workdir, output_name) + try: + for command in commands: + logger.debug(run_command(command, cwd=workdir)) + full_path = get_lib_path(workdir, output_name) + if not os.path.isfile(full_path): + logger.error("Could not find desired library") + logger.info(os.listdir(workdir)) + raise FileNotFoundError(full_path) + except Exception as e: + raise CompilationError( + f"Failed to compile {output_name} on platform {get_platform_info()}" + ) from e + return full_path + + def compile_app( - workspace: str, library_name, attr, engine_config: dict, java_class_path: str + workspace: str, + library_name: str, + attr: dict, + engine_config: dict, + launcher, + java_class_path: str, ): """Compile an application. @@ -271,10 +368,11 @@ def compile_app( str: Directory containing generated java and jni code. For c++/python app, return None. str: App type. 
""" - app_dir = os.path.join(workspace, library_name) - os.makedirs(app_dir, exist_ok=True) + logger.info("Building app library...") + library_dir = os.path.join(workspace, library_name) + os.makedirs(library_dir, exist_ok=True) - _extract_gar(app_dir, attr) + _extract_gar(library_dir, attr) # codegen app and graph info # vd_type and md_type is None in cpp_pie ( @@ -309,11 +407,7 @@ def compile_app( ) check_java_app_graph_consistency(app_class, graph_type, java_app_class) - os.chdir(app_dir) - - extra_options = [] - if types_pb2.CMAKE_EXTRA_OPTIONS in attr: - extra_options = attr[types_pb2.CMAKE_EXTRA_OPTIONS].s.decode("utf-8").split(" ") + os.chdir(library_dir) module_name = "" # Output directory for java codegen @@ -321,28 +415,28 @@ def compile_app( # set OPAL_PREFIX in CMAKE_PREFIX_PATH OPAL_PREFIX = os.environ.get("OPAL_PREFIX", "") cmake_commands = [ - shutil.which("cmake"), + "cmake", ".", f"-DNETWORKX={engine_config['networkx']}", f"-DCMAKE_PREFIX_PATH='{GRAPHSCOPE_HOME};{OPAL_PREFIX}'", ] - cmake_commands.extend(extra_options) + if types_pb2.CMAKE_EXTRA_OPTIONS in attr: + extra_options = attr[types_pb2.CMAKE_EXTRA_OPTIONS].s.decode("utf-8").split(" ") + cmake_commands.extend(extra_options) + if os.environ.get("GRAPHSCOPE_ANALYTICAL_DEBUG", "") == "1": cmake_commands.append("-DCMAKE_BUILD_TYPE=Debug") if app_type == "java_pie": - if not os.path.isfile(GRAPE_PROCESSOR_JAR): - raise RuntimeError("Grape runtime jar not found") # for java need to run preprocess, and the generated files can be reused, # if the fragment & vd type is same. - java_codegen_out_dir = os.path.join( - workspace, "{}-{}".format(JAVA_CODEGEN_OUTPUT_PREFIX, library_name) + workspace, f"{JAVA_CODEGEN_OUTPUT_PREFIX}-{library_name}" ) + # TODO(zhanglei): Could this codegen caching happends on engine side? 
if os.path.isdir(java_codegen_out_dir): logger.info( - "Desired java codegen directory: {} exists, skip...".format( - java_codegen_out_dir - ) + "Found existing java codegen directory: %s, skipped codegen", + java_codegen_out_dir, ) cmake_commands += ["-DJAVA_APP_CODEGEN=OFF"] else: @@ -350,53 +444,51 @@ def compile_app( cmake_commands += [ "-DENABLE_JAVA_SDK=ON", "-DJAVA_PIE_APP=ON", - "-DPRE_CP={}:{}".format(GRAPE_PROCESSOR_JAR, java_jar_path), - "-DPROCESSOR_MAIN_CLASS={}".format(PROCESSOR_MAIN_CLASS), - "-DJAR_PATH={}".format(java_jar_path), - "-DOUTPUT_DIR={}".format(java_codegen_out_dir), + f"-DPRE_CP={GRAPE_PROCESSOR_JAR}:{java_jar_path}", + f"-DPROCESSOR_MAIN_CLASS={PROCESSOR_MAIN_CLASS}", + f"-DJAR_PATH={java_jar_path}", + f"-DOUTPUT_DIR={java_codegen_out_dir}", ] # if run llvm4jni.sh not found, we just go ahead,since it is optional. - if LLVM4JNI_HOME and os.path.isfile(os.path.join(LLVM4JNI_HOME, "run.sh")): + # The go ahead part moves to `gscoordinator/template/CMakeLists.template` + if LLVM4JNI_HOME: llvm4jni_user_out_dir = os.path.join( - workspace, "{}-{}".format(LLVM4JNI_USER_OUT_DIR_BASE, library_name) + workspace, f"{LLVM4JNI_USER_OUT_DIR_BASE}-{library_name}" ) cmake_commands += [ - "-DRUN_LLVM4JNI_SH={}".format(os.path.join(LLVM4JNI_HOME, "run.sh")), - "-DLLVM4JNI_OUTPUT={}".format(llvm4jni_user_out_dir), - "-DLIB_PATH={}".format(get_lib_path(app_dir, library_name)), + f"-DRUN_LLVM4JNI_SH={os.path.join(LLVM4JNI_HOME, 'run.sh')}", + f"-DLLVM4JNI_OUTPUT={llvm4jni_user_out_dir}", + f"-DLIB_PATH={get_lib_path(library_dir, library_name)}", ] else: logger.info( "Skip running llvm4jni since env var LLVM4JNI_HOME not found or run.sh not found under LLVM4JNI_HOME" ) - logger.info(" ".join(cmake_commands)) elif app_type not in ("cpp_pie", "cpp_pregel"): if app_type == "cython_pregel": pxd_name = "pregel" - cmake_commands += ["-DCYTHON_PREGEL_APP=True"] + cmake_commands += ["-DCYTHON_PREGEL_APP=ON"] if pregel_combine: - cmake_commands += 
["-DENABLE_PREGEL_COMBINE=True"] + cmake_commands += ["-DENABLE_PREGEL_COMBINE=ON"] else: pxd_name = "pie" - cmake_commands += ["-DCYTHON_PIE_APP=True"] + cmake_commands += ["-DCYTHON_PIE_APP=ON"] # Copy pxd file and generate cc file from pyx shutil.copyfile( os.path.join(TEMPLATE_DIR, f"{pxd_name}.pxd.template"), - os.path.join(app_dir, f"{pxd_name}.pxd"), + os.path.join(library_dir, f"{pxd_name}.pxd"), ) # Assume the gar will have and only have one .pyx file - for pyx_file in glob.glob(app_dir + "/*.pyx"): + for pyx_file in glob.glob(library_dir + "/*.pyx"): module_name = os.path.splitext(os.path.basename(pyx_file))[0] - cc_file = os.path.join(app_dir, module_name + ".cc") - subprocess.check_call( - [shutil.which("cython"), "-3", "--cplus", "-o", cc_file, pyx_file] - ) + cc_file = os.path.join(library_dir, module_name + ".cc") + subprocess.check_call(["cython", "-3", "--cplus", "-o", cc_file, pyx_file]) app_header = f"{module_name}.h" # replace and generate cmakelist cmakelists_file_tmp = os.path.join(TEMPLATE_DIR, "CMakeLists.template") - cmakelists_file = os.path.join(app_dir, "CMakeLists.txt") + cmakelists_file = os.path.join(library_dir, "CMakeLists.txt") with open(cmakelists_file_tmp, mode="r") as template: content = template.read() content = Template(content).safe_substitute( @@ -415,47 +507,20 @@ def compile_app( f.write(content) # compile - logger.info("Building app ...") - cmake_process = subprocess.Popen( - cmake_commands, - env=os.environ.copy(), - encoding="utf-8", - errors="replace", - stdout=subprocess.DEVNULL, - stderr=subprocess.PIPE, - universal_newlines=True, - bufsize=1, - ) - cmake_stderr_watcher = PipeWatcher(cmake_process.stderr, sys.stderr) - setattr(cmake_process, "stderr_watcher", cmake_stderr_watcher) - cmake_process.wait() + commands = [" ".join(cmake_commands), "make -j2"] + lib_path = compile_library(commands, library_dir, library_name, launcher) - make_process = subprocess.Popen( - [shutil.which("make"), "-j4", "VERBOSE=true"], - 
env=os.environ.copy(), - encoding="utf-8", - errors="replace", - stdout=subprocess.DEVNULL, - stderr=subprocess.PIPE, - universal_newlines=True, - bufsize=1, - ) - make_stderr_watcher = PipeWatcher(make_process.stderr, sys.stderr) - setattr(make_process, "stderr_watcher", make_stderr_watcher) - make_process.wait() - lib_path = get_lib_path(app_dir, library_name) - if not os.path.isfile(lib_path): - raise CompilationError( - f"Failed to compile app {app_class} on platform {get_platform_info()}" - ) - # TODO(siyuan): Append cmake/make logs to error message when failed. return lib_path, java_jar_path, java_codegen_out_dir, app_type def compile_graph_frame( - workspace: str, library_name, attr: dict, engine_config: dict, java_class_path: str + workspace: str, + library_name: str, + attr: dict, + engine_config: dict, + launcher, ): - """Compile an application. + """Compile a graph. Args: workspace (str): Working dir. @@ -472,18 +537,29 @@ def compile_graph_frame( None: For consistency with compile_app. None: for consistency with compile_app. 
""" - + logger.info("Building graph library ...") _, graph_class, _ = _codegen_graph_info(attr) library_dir = os.path.join(workspace, library_name) os.makedirs(library_dir, exist_ok=True) - graph_type = attr[types_pb2.GRAPH_TYPE].i + # replace and generate cmakelist + cmakelists_file_tmp = os.path.join(TEMPLATE_DIR, "CMakeLists.template") + cmakelists_file = os.path.join(library_dir, "CMakeLists.txt") + with open(cmakelists_file_tmp, mode="r", encoding="utf-8") as template: + content = template.read() + content = Template(content).safe_substitute( + _analytical_engine_home=ANALYTICAL_ENGINE_HOME, + _frame_name=library_name, + _graph_type=graph_class, + ) + with open(cmakelists_file, mode="w", encoding="utf-8") as f: + f.write(content) # set OPAL_PREFIX in CMAKE_PREFIX_PATH OPAL_PREFIX = os.environ.get("OPAL_PREFIX", "") cmake_commands = [ - shutil.which("cmake"), + "cmake", ".", f"-DNETWORKX={engine_config['networkx']}", f"-DENABLE_JAVA_SDK={engine_config['enable_java_sdk']}", @@ -492,66 +568,21 @@ def compile_graph_frame( if os.environ.get("GRAPHSCOPE_ANALYTICAL_DEBUG", "") == "1": cmake_commands.append("-DCMAKE_BUILD_TYPE=Debug") logger.info("Enable java sdk: %s", engine_config["enable_java_sdk"]) + graph_type = attr[types_pb2.GRAPH_TYPE].i if graph_type == graph_def_pb2.ARROW_PROPERTY: - cmake_commands += ["-DPROPERTY_GRAPH_FRAME=True"] + cmake_commands += ["-DPROPERTY_GRAPH_FRAME=ON"] elif graph_type in ( graph_def_pb2.ARROW_PROJECTED, graph_def_pb2.DYNAMIC_PROJECTED, graph_def_pb2.ARROW_FLATTENED, ): - cmake_commands += ["-DPROJECT_FRAME=True"] + cmake_commands += ["-DPROJECT_FRAME=ON"] else: raise ValueError(f"Illegal graph type: {graph_type}") - # replace and generate cmakelist - cmakelists_file_tmp = os.path.join(TEMPLATE_DIR, "CMakeLists.template") - cmakelists_file = os.path.join(library_dir, "CMakeLists.txt") - with open(cmakelists_file_tmp, mode="r", encoding="utf-8") as template: - content = template.read() - content = 
Template(content).safe_substitute( - _analytical_engine_home=ANALYTICAL_ENGINE_HOME, - _frame_name=library_name, - _graph_type=graph_class, - ) - with open(cmakelists_file, mode="w", encoding="utf-8") as f: - f.write(content) # compile - logger.info("Building graph library ...") - cmake_process = subprocess.Popen( - cmake_commands, - cwd=library_dir, - env=os.environ.copy(), - encoding="utf-8", - errors="replace", - stdout=subprocess.DEVNULL, - stderr=subprocess.PIPE, - universal_newlines=True, - bufsize=1, - ) - cmake_stderr_watcher = PipeWatcher(cmake_process.stderr, sys.stderr) - setattr(cmake_process, "stderr_watcher", cmake_stderr_watcher) - cmake_process.wait() - - make_process = subprocess.Popen( - [shutil.which("make"), "-j4", "VERBOSE=true"], - cwd=library_dir, - env=os.environ.copy(), - encoding="utf-8", - errors="replace", - stdout=subprocess.DEVNULL, - stderr=subprocess.PIPE, - universal_newlines=True, - bufsize=1, - ) - make_stderr_watcher = PipeWatcher(make_process.stderr, sys.stderr) - setattr(make_process, "stderr_watcher", make_stderr_watcher) - make_process.wait() - lib_path = get_lib_path(library_dir, library_name) - if not os.path.isfile(lib_path): - raise CompilationError( - f"Failed to compile graph {graph_class} at {library_dir} on platform {get_platform_info()}" - ) - # TODO(siyuan): Append cmake/make logs to error message when failed. 
+    commands = [" ".join(cmake_commands), "make -j2"]
+    lib_path = compile_library(commands, library_dir, library_name, launcher)
 
     return lib_path, None, None, None
 
 
@@ -1418,7 +1449,7 @@ def _probe_for_java_app(attr, java_class_path, real_algo):
         driver_header = "apps/java_pie/java_pie_projected_parallel_app.h"
         class_name = "gs::JavaPIEProjectedParallelApp"
     else:
-        raise RuntimeError("Not a supported java_app_type: {}".format(_java_app_type))
+        raise RuntimeError(f"Not a supported java_app_type: {_java_app_type}")
     return driver_header, class_name, _vd_type, _frag_param_str
 
 
@@ -1700,7 +1731,7 @@ class ResolveMPICmdPrefix(object):
              '-n', '4', '-host', 'h1:2,h2:1,h3:1']
         >>> env
-        {'OMPI_MCA_plm_rsh_agent': '/usr/bin/kube_ssh', # if /usr/bin/kube_ssh in $PATH
+        {'OMPI_MCA_plm_rsh_agent': '/usr/local/bin/kube_ssh', # if kube_ssh in $PATH
          'OMPI_MCA_btl_vader_single_copy_mechanism': 'none',
          'OMPI_MCA_orte_allowed_exit_without_sync': '1'}
@@ -1797,7 +1828,7 @@ def resolve(self, num_workers, hosts):
         env["OMPI_MCA_btl_vader_single_copy_mechanism"] = "none"
         env["OMPI_MCA_orte_allowed_exit_without_sync"] = "1"
         # OMPI sends SIGCONT -> SIGTERM -> SIGKILL to the worker process,
-        # set the following MCA parameter to zero will emilinates the chances
+        # set the following MCA parameter to zero will eliminate the chances
         # where the process dies before receiving the SIGTERM and do cleanup.
env["OMPI_MCA_odls_base_sigkill_timeout"] = "0" diff --git a/coordinator/requirements.txt b/coordinator/requirements.txt index e7baf7d79937..ae9d95c89502 100644 --- a/coordinator/requirements.txt +++ b/coordinator/requirements.txt @@ -1,4 +1,3 @@ -cmake>=3.21.2 etcd-distro>=3.5.1 graphscope-client>=0.11.0 grpcio<=1.43.0,>=1.40.0 diff --git a/coordinator/setup.py b/coordinator/setup.py index 20d123895f3a..71f45100fc64 100644 --- a/coordinator/setup.py +++ b/coordinator/setup.py @@ -68,7 +68,7 @@ def _get_extra_data(): # # For shrink the package size less than "100M", we split graphscope into # 1) graphscope: libs include *.so, runtime such as 'bin', and full-openmpi - # 2) gs-coordinator: include python releated code of gscoordinator + # 2) gs-coordinator: include python related code of gscoordinator # 3) gs-include: header files # 4) gs-engine: other runtime info such as 'conf', and *.jar # 5) gs-apps: precompiled builtin applications diff --git a/docs/zh/loading_graph.rst b/docs/zh/loading_graph.rst index 3ec724249d50..439f834085f0 100644 --- a/docs/zh/loading_graph.rst +++ b/docs/zh/loading_graph.rst @@ -28,10 +28,10 @@ GraphScope 内置了一组流行的数据集,以及载入他们的工具函数 import graphscope from graphscope.dataset import load_ogbn_mag - sess = graphscope.session(cluster_type='k8s', mount_dataset='/dataset') + sess = graphscope.session(cluster_type='k8s', with_dataset=True) graph = load_ogbn_mag(sess, '/dataset/ogbn_mag_small') -这里,我们首先创建一个会话,然后将数据集桶挂载到 :file:`/dataset`,此路径相对于 Pod 的本地路径。然后我们将会话作为参数传入,路径 :file:`/dataset/ogbn_mag_small` 作为第二个参数。 :file:`/dataset` 是我们通过 `mount_dataset` 的参数指定的挂载路径, `ogbn_mag_small` 是这个数据集所在的文件夹的名字。 +这里,我们首先创建一个会话,然后将数据集桶挂载到 :file:`/dataset`,此路径相对于 Pod 的本地路径。然后我们将会话作为参数传入,路径 :file:`/dataset/ogbn_mag_small` 作为第二个参数。 :file:`/dataset` 是我们通过 `with_dataset` 的参数指定的挂载路径, `ogbn_mag_small` 是这个数据集所在的文件夹的名字。 你可以在 ``_ 找到所有目前支持的数据集,文件中包括详细的介绍和用法。 diff --git a/interactive_engine/assembly/src/bin/graphscope/giectl b/interactive_engine/assembly/src/bin/graphscope/giectl 
index 7abd83655c5d..d39f0c9451b5 100755 --- a/interactive_engine/assembly/src/bin/graphscope/giectl +++ b/interactive_engine/assembly/src/bin/graphscope/giectl @@ -1,9 +1,7 @@ #!/usr/bin/env bash # # interactive_engine command tool - -set -e -set -o pipefail +set -eo pipefail # color readonly RED="\033[0;31m" readonly YELLOW="\033[1;33m" @@ -255,8 +253,6 @@ create_gremlin_instance_on_k8s() { declare -r frontend_port=$8 declare -r coordinator_name=$9 # deployment name of coordinator - declare -r host_ip=$(hostname -i) - instance_id=${coordinator_name##*-} pod_ips=$(kubectl get pod -lapp.kubernetes.io/component=engine,app.kubernetes.io/instance=${instance_id} -o jsonpath='{.items[*].status.podIP}') @@ -266,8 +262,14 @@ create_gremlin_instance_on_k8s() { done pegasus_hosts=${pegasus_hosts:1} - start_frontend ${GRAPHSCOPE_RUNTIME} ${object_id} ${schema_path} ${pegasus_hosts} \ - ${frontend_port} + frontend_name=$(kubectl get pod -lapp.kubernetes.io/component=frontend,app.kubernetes.io/instance=${instance_id} -o jsonpath='{.items[*].metadata.name}') + + launch_frontend_cmd="GRAPHSCOPE_HOME=${GRAPHSCOPE_HOME} \ + ${GRAPHSCOPE_HOME}/bin/giectl start_frontend \ + ${GRAPHSCOPE_RUNTIME} ${object_id} ${schema_path} ${pegasus_hosts} ${frontend_port}" + kubectl cp ${schema_path} ${frontend_name}:${schema_path} + kubectl exec ${frontend_name} -- /bin/bash -c "${launch_frontend_cmd}" + network_servers="" for pod in ${pod_ips}; do @@ -279,15 +281,18 @@ create_gremlin_instance_on_k8s() { for pod in $(echo ${pod_hosts}) do launch_executor_cmd="GRAPHSCOPE_HOME=${GRAPHSCOPE_HOME} ${GRAPHSCOPE_HOME}/bin/giectl start_executor ${GRAPHSCOPE_RUNTIME} ${object_id} ${_server_id} ${server_size} ${executor_rpc_port} ${network_servers}" - kubectl --namespace=${KUBE_NAMESPACE} exec ${pod} -c ${engine_container} -- /bin/bash -c "${launch_executor_cmd}" + # kubectl exec ${pod} -c ${engine_container} -- sudo mkdir -p /var/log/graphscope + # kubectl exec ${pod} -c ${engine_container} -- sudo chown 
-R graphscope:graphscope /var/log/graphscope
+        kubectl exec ${pod} -c ${engine_container} -- /bin/bash -c "${launch_executor_cmd}"
         (( _server_id+=1 ))
     done
 
     log "Expose gremlin server."
+    # random from range [50000, 51000) for interactive engine
+    frontend_external_port=$(( ((RANDOM<<15)|RANDOM) % 1000 + 50000 ))
+    frontend_deployment_name=$(kubectl get deployment -lapp.kubernetes.io/component=frontend,app.kubernetes.io/instance=${instance_id} -o jsonpath='{.items[*].metadata.name}')
     if [ "${GREMLIN_EXPOSE}" = "LoadBalancer" ]; then
-        # random from range [50001, 51000) for interactive engine
-        frontend_external_port=$(( ((RANDOM<<15)|RANDOM) % 50001 + 51000 ))
-        kubectl expose deployment ${coordinator_name} --name=gremlin-${object_id} --port=${frontend_external_port} \
+        kubectl expose deployment ${frontend_deployment_name} --name=gremlin-${object_id} --port=${frontend_external_port} \
             --target-port=${frontend_port} --type=LoadBalancer 1>/dev/null 2>&1
         [ $? -eq 0 ] || exit 1
         wait_period_seconds=0
@@ -307,7 +312,7 @@ create_gremlin_instance_on_k8s() {
     else
         # NodePort service type
         # expose gremlin service
-        kubectl expose deployment ${coordinator_name} --name=gremlin-${object_id} --port=${frontend_external_port} \
+        kubectl expose deployment ${frontend_deployment_name} --name=gremlin-${object_id} --port=${frontend_external_port} \
             --target-port=${frontend_port} --type=NodePort 1>/dev/null 2>&1
         [ $? -eq 0 ] || exit 1
         wait_period_seconds=0
@@ -327,7 +332,7 @@ create_gremlin_instance_on_k8s() {
         wait_period_seconds=0
         while true
         do
-            external_ip=$(kubectl describe pods ${coordinator_name} | grep "Node:" | head -1 | awk -F '[ /]+' '{print $3}')
+            external_ip=$(kubectl describe pods ${frontend_deployment_name} | grep "Node:" | head -1 | awk -F '[ /]+' '{print $3}')
             if [ -n "${external_ip}" ]; then
                 break
             fi
@@ -339,8 +344,9 @@ create_gremlin_instance_on_k8s() {
            sleep 5
         done
     fi
-
-    log "FRONTEND_ENDPOINT:${host_ip}:${frontend_port}"
+    # currently support only 1 pod.
+ frontend_ip=$(kubectl get pod -lapp.kubernetes.io/component=frontend,app.kubernetes.io/instance=${instance_id} -o jsonpath='{.items[*].status.podIP}') + log "FRONTEND_ENDPOINT:${frontend_ip}:${frontend_port}" log "FRONTEND_EXTERNAL_ENDPOINT:${external_ip}:${frontend_external_port}" } @@ -386,25 +392,21 @@ close_gremlin_instance_on_k8s() { declare -r object_id=$2 declare -r pod_hosts=$(echo $3 | awk -F"," '{for(i=1;i<=NF;++i) {print $i" "}}') declare -r engine_container=$4 + declare -r instance_id=$5 declare -r pid_dir=${GRAPHSCOPE_RUNTIME}/pid/${object_id} + # delete service + log "delete gremlin service" + kubectl delete service gremlin-${object_id} || true + # kill frontend and coordinator process log "Close frontend process." + frontend_name=$(kubectl get pod -lapp.kubernetes.io/component=frontend,app.kubernetes.io/instance=${instance_id} -o jsonpath='{.items[*].metadata.name}') - declare str=$(cat ${pid_dir}/frontend.pid) - - # The file may have multiple pids, each in a single line - # This will read each line into an array - while read -r pid; do pids+=("$pid"); done <<<"${str}" - - for pid in "${pids[@]}"; do - kill ${pid} || true - done + kill_frontend_cmd="ps -ef | grep ${object_id} | grep -v grep | awk '{print \$2}' | xargs kill -9" + kubectl exec ${frontend_name} -- sh -c "${kill_frontend_cmd}" - # delete service - log "delete gremlin service" - kubectl delete service gremlin-${object_id} || true # kill executor process on engine container. log "Close executor process on engine container." 
@@ -412,7 +414,7 @@ close_gremlin_instance_on_k8s() { do kill_executor_process_cmd="ps -ef | grep gaia_${object_id}executor | grep -v grep | awk '{print \$2}' | xargs kill -9" - kubectl --namespace=${KUBE_NAMESPACE} exec ${pod} -c ${engine_container} -- sh -c "${kill_executor_process_cmd}" + kubectl exec ${pod} -c ${engine_container} -- sh -c "${kill_executor_process_cmd}" done } diff --git a/interactive_engine/tests/function_test.sh b/interactive_engine/tests/function_test.sh index 2a605b6dd050..3bfe04cd334c 100755 --- a/interactive_engine/tests/function_test.sh +++ b/interactive_engine/tests/function_test.sh @@ -19,10 +19,6 @@ tmp_result="$curdir/tmp_result" function _start { _port=$1 workers=$2 - gs_image=$3 - if [ -z "$gs_image" ]; then - gs_image="registry.cn-hongkong.aliyuncs.com/graphscope/graphscope:${version}" - fi if [ -z "$GS_TEST_DIR" ]; then export GS_TEST_DIR=$curdir/src/main/resources fi @@ -32,7 +28,7 @@ function _start { curl -XPOST http://localhost:${_port} -d 'graphscope.set_option(show_log=True)' curl -XPOST http://localhost:${_port} -d 'from graphscope.framework.loader import Loader' curl -XPOST http://localhost:${_port} -d 'from graphscope.dataset import load_modern_graph' - curl_sess="curl -XPOST http://localhost:${_port} -d 'session = graphscope.session(num_workers=${workers}, k8s_volumes={\"data\": {\"type\": \"hostPath\", \"field\": {\"path\": \"${GS_TEST_DIR}\", \"type\": \"Directory\"}, \"mounts\": {\"mountPath\": \"/testingdata\"}}}, k8s_coordinator_cpu=1.0, k8s_coordinator_mem='\''4Gi'\'', k8s_vineyard_cpu=1.0, k8s_vineyard_mem='\''4Gi'\'', vineyard_shared_mem='\''4Gi'\'', k8s_engine_cpu=1.0, k8s_engine_mem='\''4Gi'\'', k8s_etcd_num_pods=3, k8s_etcd_cpu=2, k8s_gs_image='\''${gs_image}'\'')' --write-out %{http_code} --silent --output ./curl.tmp" + curl_sess="curl -XPOST http://localhost:${_port} -d 'session = graphscope.session(num_workers=${workers}, k8s_volumes={\"data\": {\"type\": \"hostPath\", \"field\": {\"path\": 
\"${GS_TEST_DIR}\", \"type\": \"Directory\"}, \"mounts\": {\"mountPath\": \"/testingdata\"}}}, k8s_coordinator_cpu=1.0, k8s_coordinator_mem='\''4Gi'\'', k8s_vineyard_cpu=1.0, k8s_vineyard_mem='\''4Gi'\'', vineyard_shared_mem='\''4Gi'\'', k8s_engine_cpu=1.0, k8s_engine_mem='\''4Gi'\'', k8s_image_registry='\''${GS_REGISTRY}'\'', k8s_image_tag='\''${GS_TAG}'\'')' --write-out %{http_code} --silent --output ./curl.tmp" echo $curl_sess code=$(sh -c "$curl_sess") diff --git a/k8s/Makefile b/k8s/Makefile index 6bf9852c192c..9f054d7b1073 100644 --- a/k8s/Makefile +++ b/k8s/Makefile @@ -1,7 +1,6 @@ MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) WORKING_DIR := $(dir $(MKFILE_PATH)) DOCKERFILES_DIR := $(WORKING_DIR)/dockerfiles -SHORT_SHA := $(shell git rev-parse --short HEAD) ifeq ($(REGISTRY),) REGISTRY := registry.cn-hongkong.aliyuncs.com @@ -9,6 +8,10 @@ endif VERSION ?= latest VINEYARD_VERSION ?= v0.11.2 +# This is the version of builder base image in most cases, except for graphscope-dev +BUILDER_VERSION ?= $(VINEYARD_VERSION) +# This is the version of runtime base image +RUNTIME_VERSION ?= $(VINEYARD_VERSION) PROFILE ?= release CI ?= false @@ -45,7 +48,7 @@ graphscope-dev: cd $(WORKING_DIR) && \ docker build \ --build-arg REGISTRY=$(REGISTRY) \ - --build-arg BASE_VERSION=latest \ + --build-arg BUILDER_VERSION=latest \ --build-arg VINEYARD_VERSION=${VINEYARD_VERSION} \ -t graphscope/graphscope-dev:${VINEYARD_VERSION} \ -f $(DOCKERFILES_DIR)/graphscope-dev.Dockerfile . @@ -62,15 +65,16 @@ vineyard-runtime: cd $(WORKING_DIR) && \ docker build \ --build-arg REGISTRY=$(REGISTRY) \ - --build-arg BASE_VERSION=$(VINEYARD_VERSION) \ + --build-arg BUILDER_VERSION=$(BUILDER_VERSION) \ -t graphscope/vineyard-runtime:${VINEYARD_VERSION} \ -f $(DOCKERFILES_DIR)/vineyard-runtime.Dockerfile . coordinator: cd $(WORKING_DIR)/.. 
&& \ - docker build --target coordinator \ + docker build --no-cache --target coordinator \ --build-arg REGISTRY=$(REGISTRY) \ - --build-arg BASE_VERSION=$(VINEYARD_VERSION) \ + --build-arg BUILDER_VERSION=$(BUILDER_VERSION) \ + --build-arg CI=${CI} \ -t graphscope/coordinator:${VERSION} \ -f $(DOCKERFILES_DIR)/coordinator.Dockerfile . @@ -78,7 +82,9 @@ analytical: cd $(WORKING_DIR)/.. && \ docker build --target analytical \ --build-arg REGISTRY=$(REGISTRY) \ - --build-arg BASE_VERSION=$(VINEYARD_VERSION) \ + --build-arg BUILDER_VERSION=$(BUILDER_VERSION) \ + --build-arg RUNTIME_VERSION=$(RUNTIME_VERSION) \ + --build-arg CI=${CI} \ -t graphscope/analytical:${VERSION} \ -f $(DOCKERFILES_DIR)/analytical.Dockerfile . @@ -86,7 +92,9 @@ analytical-java: cd $(WORKING_DIR)/.. && \ docker build --target analytical-java \ --build-arg REGISTRY=$(REGISTRY) \ - --build-arg BASE_VERSION=$(VINEYARD_VERSION) \ + --build-arg BUILDER_VERSION=$(BUILDER_VERSION) \ + --build-arg RUNTIME_VERSION=$(RUNTIME_VERSION) \ + --build-arg CI=${CI} \ -t graphscope/analytical-java:${VERSION} \ -f $(DOCKERFILES_DIR)/analytical.Dockerfile . @@ -94,8 +102,10 @@ interactive-frontend: cd $(WORKING_DIR)/.. && \ docker build --target frontend \ --build-arg REGISTRY=$(REGISTRY) \ - --build-arg BASE_VERSION=$(VINEYARD_VERSION) \ + --build-arg BUILDER_VERSION=$(BUILDER_VERSION) \ + --build-arg RUNTIME_VERSION=$(RUNTIME_VERSION) \ --build-arg profile=$(PROFILE) \ + --build-arg CI=${CI} \ -t graphscope/interactive-frontend:${VERSION} \ -f $(DOCKERFILES_DIR)/interactive.Dockerfile . @@ -103,8 +113,10 @@ interactive-executor: cd $(WORKING_DIR)/.. 
\ && docker build --target executor \ --build-arg REGISTRY=$(REGISTRY) \ - --build-arg BASE_VERSION=$(VINEYARD_VERSION) \ + --build-arg BUILDER_VERSION=$(BUILDER_VERSION) \ + --build-arg RUNTIME_VERSION=$(RUNTIME_VERSION) \ --build-arg profile=$(PROFILE) \ + --build-arg CI=${CI} \ -t graphscope/interactive-executor:${VERSION} \ -f $(DOCKERFILES_DIR)/interactive.Dockerfile . @@ -113,7 +125,7 @@ interactive-experimental: cd $(WORKING_DIR)/.. && \ docker build --target experimental \ --build-arg REGISTRY=$(REGISTRY) \ - --build-arg BASE_VERSION=$(VINEYARD_VERSION) \ + --build-arg BUILDER_VERSION=$(BUILDER_VERSION) \ -t graphscope/interactive-experimental:${VERSION} \ -f $(DOCKERFILES_DIR)/interactive-experimental.Dockerfile . @@ -121,7 +133,9 @@ learning: cd $(WORKING_DIR)/.. && \ docker build --target learning \ --build-arg REGISTRY=$(REGISTRY) \ - --build-arg BASE_VERSION=$(VINEYARD_VERSION) \ + --build-arg BUILDER_VERSION=$(BUILDER_VERSION) \ + --build-arg RUNTIME_VERSION=$(RUNTIME_VERSION) \ + --build-arg CI=${CI} \ -t graphscope/learning:${VERSION} \ -f $(DOCKERFILES_DIR)/learning.Dockerfile . @@ -130,8 +144,8 @@ graphscope-store: docker build \ --progress=$(BUILD_PROGRESS) \ --build-arg REGISTRY=$(REGISTRY) \ - --build-arg BASE_VERSION=$(VINEYARD_VERSION) \ + --build-arg BUILDER_VERSION=$(BUILDER_VERSION) \ --build-arg profile=${PROFILE} \ - -t graphscope/graphscope-store:${SHORT_SHA} \ + -t graphscope/graphscope-store:${VERSION} \ --network=host \ -f $(DOCKERFILES_DIR)/graphscope-store.Dockerfile . 
diff --git a/k8s/actions-runner-controller/manylinux/Dockerfile b/k8s/actions-runner-controller/manylinux/Dockerfile index ab28d5451972..db3f635657bb 100644 --- a/k8s/actions-runner-controller/manylinux/Dockerfile +++ b/k8s/actions-runner-controller/manylinux/Dockerfile @@ -1,6 +1,6 @@ ARG REGISTRY=registry.cn-hongkong.aliyuncs.com -ARG BASE_VERSION=latest -FROM $REGISTRY/graphscope/graphscope-dev:$BASE_VERSION +ARG BUILDER_VERSION=latest +FROM $REGISTRY/graphscope/graphscope-dev:$BUILDER_VERSION ARG TARGETPLATFORM ARG RUNNER_VERSION=2.287.1 diff --git a/k8s/actions-runner-controller/manylinux/Makefile b/k8s/actions-runner-controller/manylinux/Makefile index 951d57a778bb..1debb2536214 100644 --- a/k8s/actions-runner-controller/manylinux/Makefile +++ b/k8s/actions-runner-controller/manylinux/Makefile @@ -13,6 +13,7 @@ RUNNER_VERSION ?= 2.287.1 DOCKER_VERSION ?= 20.10.12 VINEYARD_VERSION ?= v0.11.2 +BUILDER_VERSION ?= $(VINEYARD_VERSION) # default list of platforms for which multiarch image is built ifeq (${PLATFORMS}, ) @@ -25,5 +26,5 @@ build: --build-arg TARGETPLATFORM=${TARGETPLATFORM} \ --build-arg RUNNER_VERSION=${RUNNER_VERSION} \ --build-arg DOCKER_VERSION=${DOCKER_VERSION} \ - --build-arg BASE_VERSION=$(VINEYARD_VERSION) \ + --build-arg BUILDER_VERSION=$(BUILDER_VERSION) \ -t ${NAME}:${TAG} . diff --git a/k8s/actions-runner-controller/ubuntu.yaml b/k8s/actions-runner-controller/ubuntu.yaml index 08593f4dd59d..5321c1b99e4f 100644 --- a/k8s/actions-runner-controller/ubuntu.yaml +++ b/k8s/actions-runner-controller/ubuntu.yaml @@ -17,7 +17,7 @@ spec: limits: cpu: "31.0" memory: "100Gi" - ephemeral-storage: 100Gi + ephemeral-storage: 200Gi requests: cpu: "27.0" memory: "60Gi" diff --git a/k8s/build_scripts/build_vineyard.sh b/k8s/build_scripts/build_vineyard.sh index 6d33b02010b2..aa64fc38b22b 100755 --- a/k8s/build_scripts/build_vineyard.sh +++ b/k8s/build_scripts/build_vineyard.sh @@ -11,12 +11,13 @@ cd ${WORKDIR} && \ cmake . 
-DCMAKE_INSTALL_PREFIX=/opt/vineyard && \ make -j$(nproc) && \ make install && \ + strip /opt/vineyard/bin/run_app && \ rm -rf ${WORKDIR}/libgrape-lite # Vineyard echo "Installing vineyard" cd ${WORKDIR} && \ - git clone -b ${VINEYARD_VERSION:-main} https://github.com/v6d-io/v6d.git --depth=1 && \ + git clone -b v0.11.2 https://github.com/v6d-io/v6d.git --depth=1 && \ pushd v6d && \ git submodule update --init && \ cmake . -DCMAKE_PREFIX_PATH=/opt/vineyard \ @@ -30,5 +31,5 @@ cd ${WORKDIR} && \ python3 setup.py bdist_wheel && \ python3 setup_io.py bdist_wheel && \ pip3 install dist/* && \ - sudo cp -r /opt/vineyard/* /usr/local/ && \ + sudo cp -rs /opt/vineyard/* /usr/local/ && \ rm -rf ${WORKDIR}/v6d diff --git a/k8s/dockerfiles/analytical.Dockerfile b/k8s/dockerfiles/analytical.Dockerfile index 7811a9afe023..ec4a403be4eb 100644 --- a/k8s/dockerfiles/analytical.Dockerfile +++ b/k8s/dockerfiles/analytical.Dockerfile @@ -1,54 +1,79 @@ # Analytical engine ARG REGISTRY=registry.cn-hongkong.aliyuncs.com -ARG BASE_VERSION=latest -FROM $REGISTRY/graphscope/graphscope-dev:$BASE_VERSION AS builder +ARG BUILDER_VERSION=latest +ARG RUNTIME_VERSION=latest +############### BUILDER: ANALYTICAL ####################### +FROM $REGISTRY/graphscope/graphscope-dev:$BUILDER_VERSION AS builder -COPY . /home/graphscope/GraphScope +ARG CI=false -RUN sudo chown -R graphscope:graphscope /home/graphscope/GraphScope -RUN cd /home/graphscope/GraphScope/ \ - && mkdir /home/graphscope/install \ - && make analytical-install INSTALL_PREFIX=/home/graphscope/install +COPY --chown=graphscope:graphscope . 
/home/graphscope/GraphScope -############### RUNTIME: GAE ####################### -FROM $REGISTRY/graphscope/vineyard-dev:$BASE_VERSION AS analytical +RUN cd /home/graphscope/GraphScope/ && \ + if [ "${CI}" == "true" ]; then \ + cp -r artifacts/analytical /home/graphscope/install; \ + else \ + export GRAPHSCOPE_HOME=/home/graphscope/install; \ + mkdir ${GRAPHSCOPE_HOME}; \ + make analytical-install INSTALL_PREFIX=${GRAPHSCOPE_HOME}; \ + strip ${GRAPHSCOPE_HOME}/bin/grape_engine; \ + strip ${GRAPHSCOPE_HOME}/lib/*.so; \ + python3 ./k8s/utils/precompile.py --graph --output_dir ${GRAPHSCOPE_HOME}/builtin; \ + strip /home/graphscope/install/builtin/*/*.so; \ + fi -COPY --from=builder /home/graphscope/install /opt/graphscope/ -COPY ./k8s/utils/kube_ssh /usr/local/bin/kube_ssh +############### RUNTIME: ANALYTICAL ####################### +FROM $REGISTRY/graphscope/vineyard-dev:$RUNTIME_VERSION AS analytical + +ENV GRAPHSCOPE_HOME=/opt/graphscope +ENV PATH=$PATH:$GRAPHSCOPE_HOME/bin LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GRAPHSCOPE_HOME/lib + +USER root -ENV GRAPHSCOPE_HOME=/opt/graphscope LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/graphscope/lib +COPY ./k8s/utils/kube_ssh /usr/local/bin/kube_ssh +COPY --from=builder /home/graphscope/install /opt/graphscope/ +RUN mkdir -p /tmp/gs && mv /opt/graphscope/builtin /tmp/gs/builtin && chown -R graphscope:graphscope /tmp/gs +RUN chmod +x /opt/graphscope/bin/grape_engine USER graphscope WORKDIR /home/graphscope -############### RUNTIME: GAE-JAVA ####################### -FROM $REGISTRY/graphscope/graphscope-dev:$BASE_VERSION AS builder-java +############### BUILDER: ANALYTICAL-JAVA ####################### +FROM $REGISTRY/graphscope/graphscope-dev:$BUILDER_VERSION AS builder-java -COPY . /home/graphscope/GraphScope +COPY --chown=graphscope:graphscope . 
/home/graphscope/GraphScope -RUN sudo chown -R graphscope:graphscope /home/graphscope/GraphScope -RUN cd /home/graphscope/GraphScope/ \ - && mkdir /home/graphscope/install \ - && make analytical-java-install INSTALL_PREFIX=/home/graphscope/install - -FROM vineyardcloudnative/manylinux-llvm:2014-11.0.0 AS llvm +RUN cd /home/graphscope/GraphScope/ && \ + if [ "${CI}" == "true" ]; then \ + cp -r artifacts/analytical-java /home/graphscope/install; \ + else \ + export GRAPHSCOPE_HOME=/home/graphscope/install; \ + mkdir ${GRAPHSCOPE_HOME}; \ + make analytical-java-install INSTALL_PREFIX=${GRAPHSCOPE_HOME}; \ + strip ${GRAPHSCOPE_HOME}/bin/grape_engine; \ + strip ${GRAPHSCOPE_HOME}/lib/*.so; \ + python3 ./k8s/utils/precompile.py --graph --output_dir ${GRAPHSCOPE_HOME}/builtin; \ + strip /home/graphscope/install/builtin/*/*.so; \ + fi -FROM $REGISTRY/graphscope/vineyard-dev:$BASE_VERSION AS analytical-java +############### RUNTIME: ANALYTICAL-JAVA ####################### -COPY --from=builder-java /home/graphscope/install /opt/graphscope/ -COPY ./k8s/utils/kube_ssh /usr/local/bin/kube_ssh - -ENV GRAPHSCOPE_HOME=/opt/graphscope LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/graphscope/lib +FROM vineyardcloudnative/manylinux-llvm:2014-11.0.0 AS llvm +FROM $REGISTRY/graphscope/vineyard-dev:$RUNTIME_VERSION AS analytical-java COPY --from=llvm /opt/llvm11.0.0 /opt/llvm11 ENV LLVM11_HOME=/opt/llvm11 ENV LIBCLANG_PATH=$LLVM11_HOME/lib LLVM_CONFIG_PATH=$LLVM11_HOME/bin/llvm-config -# Installed size: 200M -RUN yum install -y java-1.8.0-openjdk-devel \ - && yum clean all \ - && rm -rf /var/cache/yum +ENV GRAPHSCOPE_HOME=/opt/graphscope +ENV PATH=$PATH:$GRAPHSCOPE_HOME/bin LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GRAPHSCOPE_HOME/lib + +USER root +COPY ./k8s/utils/kube_ssh /usr/local/bin/kube_ssh +COPY --from=builder-java /home/graphscope/install /opt/graphscope/ +RUN mkdir -p /tmp/gs && sudo mv /opt/graphscope/builtin /tmp/gs/builtin && chown -R graphscope:graphscope /tmp/gs +RUN sudo chmod +x 
/opt/graphscope/bin/* USER graphscope WORKDIR /home/graphscope diff --git a/k8s/dockerfiles/coordinator.Dockerfile b/k8s/dockerfiles/coordinator.Dockerfile index 9949edec4622..efff1bf7699f 100644 --- a/k8s/dockerfiles/coordinator.Dockerfile +++ b/k8s/dockerfiles/coordinator.Dockerfile @@ -1,29 +1,34 @@ # Coordinator of graphscope engines ARG REGISTRY=registry.cn-hongkong.aliyuncs.com -ARG BASE_VERSION=latest -FROM $REGISTRY/graphscope/graphscope-dev:$BASE_VERSION AS builder - -ADD . /home/graphscope/GraphScope - -RUN sudo chown -R graphscope:graphscope /home/graphscope/GraphScope -RUN cd /home/graphscope/GraphScope \ - && mkdir /home/graphscope/install \ - && make learning-install INSTALL_PREFIX=/home/graphscope/install \ - && python3 -m pip install "numpy==1.18.5" "pandas<1.5.0" "grpcio<=1.43.0,>=1.40.0" "grpcio-tools<=1.43.0,>=1.40.0" wheel \ - && cd /home/graphscope/GraphScope/python \ - && python3 setup.py bdist_wheel \ - && cp dist/*.whl /home/graphscope/install/ \ - && cd /home/graphscope/GraphScope/coordinator \ - && package_name=gs-coordinator python3 setup.py bdist_wheel \ - && cp dist/*.whl /home/graphscope/install/ +ARG BUILDER_VERSION=latest +FROM $REGISTRY/graphscope/graphscope-dev:$BUILDER_VERSION AS builder + +ARG CI=false + +COPY --chown=graphscope:graphscope . 
/home/graphscope/GraphScope + +RUN cd /home/graphscope/GraphScope/ && \ + if [ "${CI}" == "true" ]; then \ + cp -r artifacts/learning /home/graphscope/install; \ + else \ + mkdir /home/graphscope/install; \ + make learning-install INSTALL_PREFIX=/home/graphscope/install; \ + python3 -m pip install "numpy==1.18.5" "pandas<1.5.0" "grpcio<=1.43.0,>=1.40.0" "grpcio-tools<=1.43.0,>=1.40.0" wheel; \ + cd /home/graphscope/GraphScope/python; \ + python3 setup.py bdist_wheel; \ + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/graphscope/GraphScope/learning_engine/graph-learn/graphlearn/built/lib; \ + auditwheel repair --plat=manylinux2014_x86_64 dist/*.whl; \ + cp wheelhouse/*.whl /home/graphscope/install/; \ + cd /home/graphscope/GraphScope/coordinator; \ + python3 setup.py bdist_wheel; \ + cp dist/*.whl /home/graphscope/install/; \ + fi ############### RUNTIME: Coordinator ####################### FROM centos:7.9.2009 AS coordinator -COPY --from=builder /home/graphscope/install /opt/graphscope/ - RUN yum install -y centos-release-scl-rh sudo && \ INSTALL_PKGS="rh-python38-python-pip" && \ yum install -y --setopt=tsflags=nodocs $INSTALL_PKGS && \ @@ -33,18 +38,28 @@ RUN yum install -y centos-release-scl-rh sudo && \ SHELL [ "/usr/bin/scl", "enable", "rh-python38" ] -RUN python3 -m pip install /opt/graphscope/*.whl && rm -rf /opt/graphscope +ENV GRAPHSCOPE_HOME=/opt/graphscope +ENV PATH=$PATH:/opt/openmpi/bin +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/openmpi/lib -COPY ./k8s/utils/kube_ssh /usr/local/bin/kube_ssh +RUN useradd -m graphscope -u 1001 \ + && echo 'graphscope ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers + +RUN sudo mkdir -p /var/log/graphscope \ + && sudo chown -R graphscope:graphscope /var/log/graphscope # kubectl v1.19.2 RUN curl -L -o /usr/local/bin/kubectl https://storage.googleapis.com/kubernetes-release/release/v1.19.2/bin/linux/amd64/kubectl RUN chmod +x /usr/local/bin/kubectl -RUN useradd -m graphscope -u 1001 \ - && echo 'graphscope ALL=(ALL) NOPASSWD:ALL' >> 
/etc/sudoers +COPY --from=builder /home/graphscope/install /opt/graphscope/ +RUN python3 -m pip install --no-cache-dir /opt/graphscope/*.whl && rm -rf /opt/graphscope/ +COPY --from=builder /opt/openmpi /opt/openmpi + +COPY ./interactive_engine/assembly/src/bin/graphscope/giectl /opt/graphscope/bin/giectl +COPY ./k8s/utils/kube_ssh /usr/local/bin/kube_ssh USER graphscope WORKDIR /home/graphscope -ENTRYPOINT [ "/usr/bin/scl", "enable", "rh-python38", "bash" ] \ No newline at end of file +ENTRYPOINT ["/bin/bash", "-c", "source scl_source enable rh-python38 && $0 $@"] diff --git a/k8s/dockerfiles/graphscope-dev-base.Dockerfile b/k8s/dockerfiles/graphscope-dev-base.Dockerfile index 68c0f74cae03..0db6604d4720 100644 --- a/k8s/dockerfiles/graphscope-dev-base.Dockerfile +++ b/k8s/dockerfiles/graphscope-dev-base.Dockerfile @@ -24,7 +24,7 @@ COPY build_scripts /build_scripts RUN mkdir /download RUN export WORKDIR=/download && bash /build_scripts/build_vineyard_dependencies.sh -RUN export WORKDIR=/download && bash /build_scripts/build_patchelf.sh +# RUN export WORKDIR=/download && bash /build_scripts/build_patchelf.sh RUN export WORKDIR=/download && bash /build_scripts/build_maven.sh ENV PATH=$PATH:/opt/apache-maven-3.8.6/bin RUN rm -rf /build_scripts /download @@ -63,3 +63,4 @@ RUN curl -sf -L https://static.rust-lang.org/rustup.sh | \ rustup component add rustfmt ENV PATH=/home/graphscope/.cargo/bin:$PATH ENV RUST_BACKTRACE=1 + diff --git a/k8s/dockerfiles/graphscope-dev.Dockerfile b/k8s/dockerfiles/graphscope-dev.Dockerfile index 196a0bb42c8d..b877ef6d68fb 100644 --- a/k8s/dockerfiles/graphscope-dev.Dockerfile +++ b/k8s/dockerfiles/graphscope-dev.Dockerfile @@ -2,8 +2,8 @@ # libgrape-lite, vineyard, as well as necessary IO dependencies (e.g., hdfs, oss) ARG REGISTRY=registry.cn-hongkong.aliyuncs.com -ARG BASE_VERSION=latest -FROM $REGISTRY/graphscope/graphscope-dev-base:$BASE_VERSION +ARG BUILDER_VERSION=latest +FROM 
$REGISTRY/graphscope/graphscope-dev-base:$BUILDER_VERSION USER root @@ -20,3 +20,4 @@ RUN export WORKDIR=/download && \ RUN rm -rf /build_scripts /download USER graphscope + diff --git a/k8s/dockerfiles/graphscope-store.Dockerfile b/k8s/dockerfiles/graphscope-store.Dockerfile index b83d91fe7c3b..3cc76a3c167e 100644 --- a/k8s/dockerfiles/graphscope-store.Dockerfile +++ b/k8s/dockerfiles/graphscope-store.Dockerfile @@ -1,20 +1,19 @@ ARG REGISTRY=registry.cn-hongkong.aliyuncs.com -ARG BASE_VERSION=latest -FROM $REGISTRY/graphscope/graphscope-dev:$BASE_VERSION as builder +ARG BUILDER_VERSION=latest +FROM $REGISTRY/graphscope/graphscope-dev:$BUILDER_VERSION as builder ARG CI=false -ENV CI=$CI ARG profile=debug ENV profile=$profile -COPY . /home/graphscope/gs -COPY ./interactive_engine/assembly/src/conf/maven.settings.xml /home/graphscope/.m2/settings.xml +COPY --chown=graphscope:graphscope . /home/graphscope/gs + +COPY --chown=graphscope:graphscope ./interactive_engine/assembly/src/conf/maven.settings.xml /home/graphscope/.m2/settings.xml USER graphscope -RUN sudo chown -R $(id -u):$(id -g) /home/graphscope/gs /home/graphscope/.m2 && \ - cd /home/graphscope/gs && \ +RUN cd /home/graphscope/gs && \ echo "install cppkafka" \ && sudo yum update -y && sudo yum install -y librdkafka-devel \ && git clone -b 0.4.0 --single-branch --depth=1 https://github.com/mfontanini/cppkafka.git /tmp/cppkafka \ diff --git a/k8s/dockerfiles/interactive-experimental.Dockerfile b/k8s/dockerfiles/interactive-experimental.Dockerfile index eb3d1c54f015..bcc802cb4ac8 100644 --- a/k8s/dockerfiles/interactive-experimental.Dockerfile +++ b/k8s/dockerfiles/interactive-experimental.Dockerfile @@ -1,12 +1,11 @@ # Interactive engine which uses experimental storage ARG REGISTRY=registry.cn-hongkong.aliyuncs.com -ARG BASE_VERSION=latest -FROM $REGISTRY/graphscope/graphscope-dev:$BASE_VERSION AS builder +ARG BUILDER_VERSION=latest +FROM $REGISTRY/graphscope/graphscope-dev:$BUILDER_VERSION AS builder -ADD . 
/home/graphscope/GraphScope +COPY --chown=graphscope:graphscope . /home/graphscope/GraphScope -RUN sudo chown -R graphscope:graphscope /home/graphscope/GraphScope RUN cd /home/graphscope/GraphScope/interactive_engine/compiler \ && make build rpc.target=start_rpc_server_k8s diff --git a/k8s/dockerfiles/interactive.Dockerfile b/k8s/dockerfiles/interactive.Dockerfile index 328dc1b69eb3..faaecb81598c 100644 --- a/k8s/dockerfiles/interactive.Dockerfile +++ b/k8s/dockerfiles/interactive.Dockerfile @@ -1,52 +1,67 @@ # Interactive engine ARG REGISTRY=registry.cn-hongkong.aliyuncs.com -ARG BASE_VERSION=latest -FROM $REGISTRY/graphscope/graphscope-dev:$BASE_VERSION AS builder +ARG BUILDER_VERSION=latest +ARG RUNTIME_VERSION=latest +FROM $REGISTRY/graphscope/graphscope-dev:$BUILDER_VERSION AS builder + +ARG CI=false ARG profile=release ENV profile=$profile -ADD . /home/graphscope/GraphScope +COPY --chown=graphscope:graphscope . /home/graphscope/GraphScope -RUN sudo chown -R graphscope:graphscope /home/graphscope/GraphScope ENV PATH=$PATH:/opt/maven/apache-maven-3.8.6/bin -RUN cd /home/graphscope/GraphScope/ \ - && mkdir /home/graphscope/install \ - && make interactive-install BUILD_TYPE="$profile" INSTALL_PREFIX=/home/graphscope/install \ - && strip /home/graphscope/install/bin/gaia_executor + +RUN cd /home/graphscope/GraphScope/ && \ + if [ "${CI}" == "true" ]; then \ + cp -r artifacts/interactive /home/graphscope/install; \ + else \ + mkdir /home/graphscope/install; \ + make interactive-install BUILD_TYPE="$profile" INSTALL_PREFIX=/home/graphscope/install; \ + fi ############### RUNTIME: frontend ####################### FROM centos:7.9.2009 AS frontend +ENV GRAPHSCOPE_HOME=/opt/graphscope +ENV PATH=$PATH:$GRAPHSCOPE_HOME/bin LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GRAPHSCOPE_HOME/lib + +RUN yum install -y java-1.8.0-openjdk sudo \ + && yum clean all \ + && rm -rf /var/cache/yum + COPY --from=builder /home/graphscope/install/bin/giectl /opt/graphscope/bin/giectl # 
vineyard.frontend.properties, log configuration files COPY --from=builder /home/graphscope/install/conf /opt/graphscope/conf # jars, libir_core.so COPY --from=builder /home/graphscope/install/lib /opt/graphscope/lib -ENV GRAPHSCOPE_HOME=/opt/graphscope LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/graphscope/lib - - -RUN yum install -y java-1.8.0-openjdk \ - && yum clean all \ - && rm -rf /var/cache/yum +RUN chmod +x /opt/graphscope/bin/giectl RUN useradd -m graphscope -u 1001 \ && echo 'graphscope ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers + +RUN sudo mkdir -p /var/log/graphscope \ + && sudo chown -R graphscope:graphscope /var/log/graphscope + USER graphscope WORKDIR /home/graphscope ############### RUNTIME: executor ####################### -FROM $REGISTRY/graphscope/vineyard-runtime:$BASE_VERSION AS executor +FROM $REGISTRY/graphscope/vineyard-runtime:$RUNTIME_VERSION AS executor + +ENV GRAPHSCOPE_HOME=/opt/graphscope +ENV PATH=$PATH:$GRAPHSCOPE_HOME/bin LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GRAPHSCOPE_HOME/lib +ENV RUST_BACKTRACE=1 # gaia_executor, giectl COPY --from=builder /home/graphscope/install/bin /opt/graphscope/bin # vineyard.executor.properties, log configuration files COPY --from=builder /home/graphscope/install/conf /opt/graphscope/conf -ENV GRAPHSCOPE_HOME=/opt/graphscope LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/graphscope/lib +RUN sudo chmod +x /opt/graphscope/bin/* -ENV RUST_BACKTRACE=1 USER graphscope WORKDIR /home/graphscope diff --git a/k8s/dockerfiles/learning.Dockerfile b/k8s/dockerfiles/learning.Dockerfile index 42ab31da649f..dbc5b51c49a2 100644 --- a/k8s/dockerfiles/learning.Dockerfile +++ b/k8s/dockerfiles/learning.Dockerfile @@ -1,27 +1,34 @@ # Learning engine ARG REGISTRY=registry.cn-hongkong.aliyuncs.com -ARG BASE_VERSION=latest -FROM $REGISTRY/graphscope/graphscope-dev:$BASE_VERSION AS builder - -ADD . 
/home/graphscope/GraphScope - -RUN sudo chown -R graphscope:graphscope /home/graphscope/GraphScope -RUN cd /home/graphscope/GraphScope/ \ - && mkdir /home/graphscope/install \ - && make learning-install INSTALL_PREFIX=/home/graphscope/install \ - && python3 -m pip install "numpy==1.18.5" "pandas<1.5.0" "grpcio<=1.43.0,>=1.40.0" "grpcio-tools<=1.43.0,>=1.40.0" wheel \ - && cd /home/graphscope/GraphScope/python \ - && python3 setup.py bdist_wheel \ - && cp dist/*.whl /home/graphscope/install/ \ - && cd /home/graphscope/GraphScope/coordinator \ - && python3 setup.py bdist_wheel \ - && cp dist/*.whl /home/graphscope/install/ +ARG BUILDER_VERSION=latest +ARG RUNTIME_VERSION=latest +FROM $REGISTRY/graphscope/graphscope-dev:$BUILDER_VERSION AS builder + +ARG CI=false + +COPY --chown=graphscope:graphscope . /home/graphscope/GraphScope + +RUN cd /home/graphscope/GraphScope/ && \ + if [ "${CI}" == "true" ]; then \ + cp -r artifacts/learning /home/graphscope/install; \ + else \ + mkdir /home/graphscope/install; \ + make learning-install INSTALL_PREFIX=/home/graphscope/install; \ + python3 -m pip install "numpy==1.18.5" "pandas<1.5.0" "grpcio<=1.43.0,>=1.40.0" "grpcio-tools<=1.43.0,>=1.40.0" wheel; \ + cd /home/graphscope/GraphScope/python; \ + python3 setup.py bdist_wheel; \ + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/graphscope/GraphScope/learning_engine/graph-learn/graphlearn/built/lib; \ + auditwheel repair --plat=manylinux2014_x86_64 dist/*.whl; \ + cp wheelhouse/*.whl /home/graphscope/install/; \ + cd /home/graphscope/GraphScope/coordinator; \ + python3 setup.py bdist_wheel; \ + cp dist/*.whl /home/graphscope/install/; \ + fi ############### RUNTIME: GLE ####################### -FROM $REGISTRY/graphscope/vineyard-runtime:$BASE_VERSION AS learning +FROM $REGISTRY/graphscope/vineyard-runtime:$RUNTIME_VERSION AS learning -COPY --from=builder /home/graphscope/install /opt/graphscope USER root @@ -34,11 +41,13 @@ RUN yum install -y centos-release-scl-rh sudo && \ SHELL [ 
"/usr/bin/scl", "enable", "rh-python38" ] -RUN python3 -m pip install /opt/graphscope/*.whl && rm -rf /opt/graphscope/*.whl +ENV GRAPHSCOPE_HOME=/opt/graphscope +ENV PATH=$PATH:$GRAPHSCOPE_HOME/bin LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GRAPHSCOPE_HOME/lib -ENV GRAPHSCOPE_HOME=/opt/graphscope LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/graphscope/lib +COPY --from=builder /home/graphscope/install /opt/graphscope +RUN python3 -m pip install --no-cache-dir /opt/graphscope/*.whl && rm -rf /opt/graphscope/*.whl USER graphscope WORKDIR /home/graphscope -ENTRYPOINT [ "/usr/bin/scl", "enable", "rh-python38", "bash" ] +ENTRYPOINT ["/bin/bash", "-c", "source scl_source enable rh-python38 && $0 $@"] diff --git a/k8s/dockerfiles/vineyard-dev.Dockerfile b/k8s/dockerfiles/vineyard-dev.Dockerfile index 5d5c33e2d53d..f6e5808d4856 100644 --- a/k8s/dockerfiles/vineyard-dev.Dockerfile +++ b/k8s/dockerfiles/vineyard-dev.Dockerfile @@ -3,14 +3,20 @@ FROM centos:7.9.2009 -RUN yum install -y centos-release-scl-rh perl which sudo wget git libunwind-devel && \ - INSTALL_PKGS="devtoolset-10-gcc-c++ rh-python38-python-pip rh-python38-python-devel" && \ +# shanghai zoneinfo +ENV TZ=Asia/Shanghai +RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && \ + echo '$TZ' > /etc/timezone + +# Install java for hadoop +RUN yum install -y centos-release-scl-rh epel-release perl which sudo wget git libunwind-devel java-1.8.0-openjdk && \ + INSTALL_PKGS="devtoolset-10-gcc-c++ rh-python38-python-pip rh-python38-python-devel rapidjson-devel msgpack-devel" && \ yum install -y --setopt=tsflags=nodocs $INSTALL_PKGS && \ yum clean all -y --enablerepo='*' && \ rm -rf /var/cache/yum SHELL [ "/usr/bin/scl", "enable", "devtoolset-10", "rh-python38" ] - +ENV PATH=${PATH}:/opt/rh/devtoolset-10/root/usr/bin:/opt/rh/rh-python38/root/usr/local/bin ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib:/usr/local/lib64 # COPY ./download /download @@ -19,7 +25,7 @@ RUN mkdir /download COPY build_scripts/build_vineyard_dependencies.sh 
/build_scripts/build_vineyard_dependencies.sh RUN export WORKDIR=/download && bash /build_scripts/build_vineyard_dependencies.sh -RUN python3 -m pip install --no-cache-dir libclang etcd-distro wheel +RUN python3 -m pip install --no-cache-dir libclang wheel ARG VINEYARD_VERSION=main COPY build_scripts/build_vineyard.sh /build_scripts/build_vineyard.sh @@ -28,10 +34,16 @@ RUN export WORKDIR=/download && \ bash /build_scripts/build_vineyard.sh RUN rm -rf /build_scripts /download -# shanghai zoneinfo -ENV TZ=Asia/Shanghai -RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && \ - echo '$TZ' > /etc/timezone +# install hadoop for processing hadoop data source +RUN cd /tmp && \ + curl -LO https://archive.apache.org/dist/hadoop/core/hadoop-2.8.4/hadoop-2.8.4.tar.gz && \ + tar zxf hadoop-2.8.4.tar.gz -C /usr/local && \ + rm -rf /usr/local/hadoop-2.8.4/share/doc/ && \ + rm -rf hadoop-2.8.4.tar.gz + +ENV JAVA_HOME=/usr/lib/jvm/jre-1.8.0 HADOOP_HOME=/usr/local/hadoop-2.8.4 +ENV HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native +ENV PATH=$PATH:$HADOOP_HOME/bin # for programming output RUN localedef -c -f UTF-8 -i en_US en_US.UTF-8 @@ -40,5 +52,15 @@ ENV LC_ALL=en_US.UTF-8 LANG=en_US.UTF-8 LANGUAGE=en_US.UTF-8 RUN useradd -m graphscope -u 1001 \ && echo 'graphscope ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers +USER graphscope + +# set the CLASSPATH for hadoop +RUN bash -l -c 'echo export CLASSPATH="$($HADOOP_HOME/bin/hdfs classpath --glob)" >> /home/graphscope/.profile' +ENV PATH=${PATH}:/home/graphscope/.local/bin + +RUN sudo mkdir -p /var/log/graphscope \ + && sudo chown -R graphscope:graphscope /var/log/graphscope + # Enable rh-python, devtoolsets-10 binary -ENTRYPOINT [ "/usr/bin/scl", "enable", "devtoolset-10", "rh-python38", "bash" ] \ No newline at end of file +ENTRYPOINT ["/bin/bash", "-c", "source scl_source enable devtoolset-10 rh-python38 && $0 $@"] + diff --git a/k8s/dockerfiles/vineyard-runtime.Dockerfile 
b/k8s/dockerfiles/vineyard-runtime.Dockerfile index b44bf5e62d41..579f03ab7ee9 100644 --- a/k8s/dockerfiles/vineyard-runtime.Dockerfile +++ b/k8s/dockerfiles/vineyard-runtime.Dockerfile @@ -2,9 +2,10 @@ # dependencies that could graphscope interactive engine. ARG REGISTRY=registry.cn-hongkong.aliyuncs.com -ARG BASE_VERSION=latest -FROM $REGISTRY/graphscope/vineyard-dev:$BASE_VERSION AS builder +ARG BUILDER_VERSION=latest +FROM $REGISTRY/graphscope/vineyard-dev:$BUILDER_VERSION AS builder +USER root WORKDIR /root RUN mkdir artifacts && \ @@ -18,16 +19,21 @@ RUN mkdir artifacts && \ FROM centos:7 AS runtime COPY --from=builder /root/artifacts/artifacts.tar.gz /root/artifacts.tar.gz - -RUN tar xzf /root/artifacts.tar.gz -C /usr/local/ +COPY --from=builder /opt/openmpi /opt/openmpi +COPY --from=builder /opt/vineyard /opt/vineyard +RUN tar xzf /root/artifacts.tar.gz -C /usr/local/ && rm /root/artifacts.tar.gz ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib:/usr/local/lib64 -RUN yum install -y sudo libunwind-devel && \ +RUN yum install -y sudo libunwind-devel libgomp && \ yum clean all -y --enablerepo='*' && \ rm -rf /var/cache/yum RUN useradd -m graphscope -u 1001 \ && echo 'graphscope ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers USER graphscope -WORKDIR /home/graphscope \ No newline at end of file +WORKDIR /home/graphscope + +RUN sudo mkdir -p /var/log/graphscope \ + && sudo chown -R graphscope:graphscope /var/log/graphscope + diff --git a/k8s/internal/Makefile b/k8s/internal/Makefile index 5734f45a998e..a9d0d0785046 100644 --- a/k8s/internal/Makefile +++ b/k8s/internal/Makefile @@ -69,7 +69,7 @@ graphscope-darwin-py3: python3 -m pip install --upgrade setuptools && \ make && \ sudo make install INSTALL_PREFIX=/opt/graphscope && \ - python3 $(WORKING_DIR)/../utils/precompile.py + python3 $(WORKING_DIR)/../utils/precompile.py --graph --app # build and delocate wheel cd $(WORKING_DIR)/../../coordinator && \ export WITH_EXTRA_DATA=ON && \ @@ -97,7 +97,7 @@ 
graphscope-manylinux2014-py3-nodocker: cd $(WORKING_DIR)/../.. && \ make && \ sudo make install INSTALL_PREFIX=/opt/graphscope && \ - python3 $(WORKING_DIR)/../utils/precompile.py && \ + python3 $(WORKING_DIR)/../utils/precompile.py --graph --app && \ export WITH_EXTRA_DATA=ON && \ cd $(WORKING_DIR)/../../coordinator && \ rm -rf build dist/*.whl && \ diff --git a/k8s/internal/jupyter.Dockerfile b/k8s/internal/jupyter.Dockerfile index 89aa0bfacace..251414361e7f 100644 --- a/k8s/internal/jupyter.Dockerfile +++ b/k8s/internal/jupyter.Dockerfile @@ -17,7 +17,7 @@ RUN sed -i 's/archive.ubuntu.com/mirrors.ustc.edu.cn/g' /etc/apt/sources.list && sed -i 's/security.ubuntu.com/mirrors.ustc.edu.cn/g' /etc/apt/sources.list && \ cat /etc/apt/sources.list && \ apt update -y && apt install -y \ - gcc python3-pip openssh-server sudo wget telnet git vim zip wget && \ + gcc python3-pip openssh-server sudo telnet zip && \ apt clean && rm -fr /var/lib/apt/lists/* # Add graphscope user with user id 1001 diff --git a/k8s/utils/precompile.py b/k8s/utils/precompile.py index 89f0e2bb0e88..0bdd30cafb05 100755 --- a/k8s/utils/precompile.py +++ b/k8s/utils/precompile.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 +import argparse import sys import hashlib import multiprocessing @@ -26,23 +27,19 @@ def compute_sig(s): COORDINATOR_HOME = Path(gscoordinator.__file__).parent.parent.absolute() except ModuleNotFoundError: COORDINATOR_HOME = Path( - os.path.abspath( os.path.join(os.path.dirname(__file__), "..", "..", "coordinator") - ) - ) + ).resolve() TEMPLATE_DIR = COORDINATOR_HOME / "gscoordinator" / "template" +CMAKELISTS_TEMPLATE = (TEMPLATE_DIR / "CMakeLists.template").resolve() BUILTIN_APP_RESOURCE_PATH = ( COORDINATOR_HOME / "gscoordinator" / "builtin" / "app" / "builtin_app.gar" -) -CMAKELISTS_TEMPLATE = TEMPLATE_DIR / "CMakeLists.template" +).resolve() GRAPHSCOPE_HOME = ( os.environ["GRAPHSCOPE_HOME"] if "GRAPHSCOPE_HOME" in os.environ else "/opt/graphscope" ) -WORKSPACE = 
Path(os.path.join("/", tempfile.gettempprefix(), "gs", "builtin")) - def cmake_and_make(cmake_commands): try: @@ -438,7 +435,40 @@ def compile_cpp_pie_app(): pool.map(cmake_app, targets) +def parse_sys_args(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument( + "--graph", + action='store_true', + help="Compile graph libraries.", + ) + parser.add_argument( + "--app", + action='store_true', + help="Compile application libraries.", + ) + parser.add_argument( + "--output_dir", + type=str, + default=WORKSPACE, + help="Output directory." + ) + return parser.parse_args() + +WORKSPACE = os.path.join("/", tempfile.gettempprefix(), "gs", "builtin") + if __name__ == "__main__": + args = parse_sys_args() + print("Launching with args", args) + WORKSPACE = args.output_dir + WORKSPACE = Path(WORKSPACE).resolve() + print("Will output libraries to", WORKSPACE) os.makedirs(WORKSPACE, exist_ok=True) - compile_graph() - compile_cpp_pie_app() + if args.graph: + print("compile graph") + compile_graph() + if args.app: + print("compile app") + compile_cpp_pie_app() diff --git a/python/graphscope/client/rpc.py b/python/graphscope/client/rpc.py index 890be1a2c51d..32f99a5c7aa7 100644 --- a/python/graphscope/client/rpc.py +++ b/python/graphscope/client/rpc.py @@ -48,8 +48,8 @@ def __init__(self, launcher, endpoint, reconnect=False): ] self._launcher = launcher self._grpc_utils = GRPCUtils() - self._channel = grpc.insecure_channel(endpoint, options=options) - self._stub = coordinator_service_pb2_grpc.CoordinatorServiceStub(self._channel) + channel = grpc.insecure_channel(endpoint, options=options) + self._stub = coordinator_service_pb2_grpc.CoordinatorServiceStub(channel) self._session_id = None self._logs_fetching_thread = None self._reconnect = reconnect diff --git a/python/graphscope/client/session.py b/python/graphscope/client/session.py index af26f80adf1d..b3f92540ce9c 100755 --- a/python/graphscope/client/session.py 
+++ b/python/graphscope/client/session.py @@ -261,7 +261,6 @@ class Session(object): you can find all params detail in :meth:`__init__` method. >>> s = graphscope.session( - ... k8s_gs_image="registry.cn-hongkong.aliyuncs.com/graphscope/graphscope:latest", ... k8s_vineyard_cpu=0.1, ... k8s_vineyard_mem="256Mi", ... vineyard_shared_mem="4Gi", @@ -286,9 +285,9 @@ def __init__( preemptive=gs_config.preemptive, k8s_namespace=gs_config.k8s_namespace, k8s_service_type=gs_config.k8s_service_type, - k8s_gs_image=gs_config.k8s_gs_image, - k8s_etcd_image=gs_config.k8s_etcd_image, - k8s_dataset_image=gs_config.k8s_dataset_image, + k8s_image_registry=gs_config.k8s_image_registry, + k8s_image_repository=gs_config.k8s_image_repository, + k8s_image_tag=gs_config.k8s_image_tag, k8s_image_pull_policy=gs_config.k8s_image_pull_policy, k8s_image_pull_secrets=gs_config.k8s_image_pull_secrets, k8s_coordinator_cpu=gs_config.k8s_coordinator_cpu, @@ -296,9 +295,7 @@ def __init__( etcd_addrs=gs_config.etcd_addrs, etcd_listening_client_port=gs_config.etcd_listening_client_port, etcd_listening_peer_port=gs_config.etcd_listening_peer_port, - k8s_etcd_num_pods=gs_config.k8s_etcd_num_pods, - k8s_etcd_cpu=gs_config.k8s_etcd_cpu, - k8s_etcd_mem=gs_config.k8s_etcd_mem, + k8s_vineyard_image=gs_config.k8s_vineyard_image, k8s_vineyard_daemonset=gs_config.k8s_vineyard_daemonset, k8s_vineyard_cpu=gs_config.k8s_vineyard_cpu, k8s_vineyard_mem=gs_config.k8s_vineyard_mem, @@ -310,14 +307,17 @@ def __init__( k8s_mars_scheduler_cpu=gs_config.mars_scheduler_cpu, k8s_mars_scheduler_mem=gs_config.mars_scheduler_mem, k8s_coordinator_pod_node_selector=gs_config.k8s_coordinator_pod_node_selector, - k8s_etcd_pod_node_selector=gs_config.k8s_etcd_pod_node_selector, k8s_engine_pod_node_selector=gs_config.k8s_engine_pod_node_selector, k8s_volumes=gs_config.k8s_volumes, k8s_waiting_for_delete=gs_config.k8s_waiting_for_delete, timeout_seconds=gs_config.timeout_seconds, 
dangling_timeout_seconds=gs_config.dangling_timeout_seconds, with_mars=gs_config.with_mars, - mount_dataset=gs_config.mount_dataset, + with_analytical=gs_config.with_analytical, + with_analytical_java=gs_config.with_analytical_java, + with_interactive=gs_config.with_interactive, + with_learning=gs_config.with_learning, + with_dataset=gs_config.with_dataset, reconnect=False, hosts=["localhost"], **kw, @@ -359,16 +359,18 @@ def __init__( k8s_service_type (str, optional): Type determines how the GraphScope service is exposed. Valid options are NodePort, and LoadBalancer. Defaults to NodePort. - k8s_gs_image (str, optional): The GraphScope engine's image. + k8s_image_registry (str, optional): The GraphScope image registry. - k8s_etcd_image (str, optional): The image of etcd, which used by vineyard. + k8s_image_repository (str, optional): The GraphScope image repository. - k8s_dataset_image(str, optional): The image which mounts aliyun dataset bucket to local path. + k8s_image_tag (str, optional): The GraphScope image tag. k8s_image_pull_policy (str, optional): Kubernetes image pull policy. Defaults to "IfNotPresent". k8s_image_pull_secrets (list[str], optional): A list of secret name used to authorize pull image. + k8s_vineyard_image (str, optional): The image of vineyard. + k8s_vineyard_daemonset (str, optional): The name of vineyard Helm deployment to use. GraphScope will try to discovery the daemonset from kubernetes cluster, then use it if exists, and fallback to launching a bundled vineyard container otherwise. @@ -390,12 +392,6 @@ def __init__( etcd_addrs (str, optional): The addr of external etcd cluster, with formats like 'etcd01:port,etcd02:port,etcd03:port' - k8s_etcd_num_pods (int, optional): The number of etcd pods. Defaults to 3. - - k8s_etcd_cpu (float, optional): Minimum number of CPU cores request for etcd pod. Defaults to 0.5. - - k8s_etcd_mem (str, optional): Minimum number of memory request for etcd pod. Defaults to '128Mi'. 
- k8s_mars_worker_cpu (float, optional): Minimum number of CPU cores request for mars worker container. Defaults to 0.5. @@ -412,10 +408,6 @@ def __init__( Node selector to the coordinator pod on k8s. Default is None. See also: https://tinyurl.com/3nx6k7ph - k8s_etcd_pod_node_selector (dict, optional): - Node selector to the etcd pod on k8s. Default is None. - See also: https://tinyurl.com/3nx6k7ph - k8s_engine_pod_node_selector = None Node selector to the engine pod on k8s. Default is None. See also: https://tinyurl.com/3nx6k7ph @@ -423,8 +415,20 @@ def __init__( with_mars (bool, optional): Launch graphscope with mars. Defaults to False. - mount_dataset (str, optional): - Create a container and mount aliyun demo dataset bucket to the path specified by `mount_dataset`. + with_analytical (bool, optional): + Launch graphscope with analytical engine. Defaults to True. + + with_analytical_java (bool, optional): + Launch graphscope with analytical engine with java support. Defaults to False. + + with_interactive (bool, optional): + Launch graphscope with interactive engine. Defaults to True. + + with_learning (bool, optional): + Launch graphscope with learning engine. Defaults to True. + + with_dataset (bool, optional): + Create a container and mount aliyun demo dataset bucket to the path `/dataset`. k8s_volumes (dict, optional): A dict of k8s volume which represents a directory containing data, accessible to the containers in a pod. Defaults to {}. @@ -488,8 +492,6 @@ def __init__( k8s_waiting_for_delete (bool, optional): Waiting for service delete or not. Defaults to False. **kw (dict, optional): Other optional parameters will be put to :code:`**kw`. - - k8s_minikube_vm_driver: Deprecated. - - k8s_client_config (dict, optional): Provide configurable parameters for connecting to remote k8s, which strongly relies on the `kube_config.load_kube_config` function. 
@@ -538,8 +540,9 @@ def __init__( "preemptive", "k8s_namespace", "k8s_service_type", - "k8s_gs_image", - "k8s_etcd_image", + "k8s_image_registry", + "k8s_image_repository", + "k8s_image_tag", "k8s_image_pull_policy", "k8s_image_pull_secrets", "k8s_coordinator_cpu", @@ -547,9 +550,7 @@ def __init__( "etcd_addrs", "etcd_listening_client_port", "etcd_listening_peer_port", - "k8s_etcd_num_pods", - "k8s_etcd_cpu", - "k8s_etcd_mem", + "k8s_vineyard_image", "k8s_vineyard_daemonset", "k8s_vineyard_cpu", "k8s_vineyard_mem", @@ -561,16 +562,18 @@ def __init__( "k8s_mars_scheduler_cpu", "k8s_mars_scheduler_mem", "k8s_coordinator_pod_node_selector", - "k8s_etcd_pod_node_selector", "k8s_engine_pod_node_selector", "with_mars", + "with_analytical", + "with_analytical_java", + "with_interactive", + "with_learning", "reconnect", "k8s_volumes", "k8s_waiting_for_delete", "timeout_seconds", "dangling_timeout_seconds", - "mount_dataset", - "k8s_dataset_image", + "with_dataset", "hosts", ) self._deprecated_params = ( @@ -604,6 +607,12 @@ def __init__( raise NotImplementedError( "Mars cluster cannot be launched along with local GraphScope deployment" ) + if with_analytical and with_analytical_java: + logger.warning( + "Cannot setup `with_analytical` and `with_analytical_java` at the same time" + ) + logger.warning("Disabled `analytical`.") + self._config_params["with_analytical"] = False # deprecated params handle for param in self._deprecated_params: @@ -617,11 +626,6 @@ def __init__( f"Please use `graphscope.set_option({param}={kw.pop(param, None)})` instead", category=DeprecationWarning, ) - if param == "k8s_vineyard_shared_mem": - warnings.warn( - "Please use 'vineyard_shared_mem' instead", - category=DeprecationWarning, - ) kw.pop(param, None) # update k8s_client_config params @@ -978,11 +982,6 @@ def _connect(self): # try to connect to exist coordinator self._coordinator_endpoint = self._config_params["addr"] elif self._cluster_type == types_pb2.K8S: - if ( - 
self._config_params["k8s_etcd_image"] is None - or self._config_params["k8s_gs_image"] is None - ): - raise K8sError("None image found.") if isinstance( self._config_params["k8s_client_config"], kube_client.api_client.ApiClient, @@ -1304,6 +1303,10 @@ def set_option(**kwargs): - k8s_mars_scheduler_cpu - k8s_mars_scheduler_mem - with_mars + - with_analytical + - with_analytical_java + - with_interactive + - with_learning - k8s_volumes - k8s_waiting_for_delete - timeout_seconds @@ -1357,6 +1360,10 @@ def get_option(key): - k8s_mars_scheduler_cpu - k8s_mars_scheduler_mem - with_mars + - with_analytical + - with_analytical_java + - with_interactive + - with_learning - k8s_volumes - k8s_waiting_for_delete - timeout_seconds diff --git a/python/graphscope/config.py b/python/graphscope/config.py index 3c1f6abf1eff..69a87ef9ebf8 100644 --- a/python/graphscope/config.py +++ b/python/graphscope/config.py @@ -37,38 +37,40 @@ class GSConfig(object): k8s_namespace = None - # etcd image - k8s_etcd_image = "quay.io/coreos/etcd:v3.4.13" - - # All in one image - k8s_gs_image = f"{registry}/graphscope/graphscope:{__version__}" - - # Coordinator image - # Also could be used as a client image - k8s_coordinator_image = f"{registry}/graphscope/coordinator:{__version__}" - - # Dataset image - k8s_dataset_image = f"{registry}/graphscope/dataset:{__version__}" + # k8s image information + # GraphScope's component has a fixed name, use registry, repository and tag to + # uniquely identify the image. For example, the coordinator image would be + # ${registry}/${repository}/coordinator:${tag} + # The image names of all major components are: + # - coordinator: The coordinator of GraphScope instance. + # - analytical: The analytical engine of GraphScope instance. + # - interactive: The interactive engine of GraphScope instance. + # - learning: The learning engine of GraphScope instance. + # These are utility components for ease of use. 
+ # - dataset: A dataset container with example datasets + # - jupyter: A jupyter notebook container with GraphScope client installed. + k8s_image_registry = "registry.cn-hongkong.aliyuncs.com" + k8s_image_repository = "graphscope" + k8s_image_tag = __version__ # image pull configuration k8s_image_pull_policy = "IfNotPresent" k8s_image_pull_secrets = [] # coordinator resource configuration - k8s_coordinator_cpu = 1.5 - k8s_coordinator_mem = "2Gi" + k8s_coordinator_cpu = 0.5 + k8s_coordinator_mem = "512Mi" # etcd resource configuration etcd_addrs = None etcd_listening_client_port = 2379 etcd_listening_peer_port = 2380 - k8s_etcd_num_pods = 1 - k8s_etcd_cpu = 1.0 - k8s_etcd_mem = "512Mi" # vineyard resource configuration - k8s_vineyard_daemonset = "none" - k8s_vineyard_cpu = 0.2 + # image for vineyard container + k8s_vineyard_image = "vineyardcloudnative/vineyardd:v0.11.2" + k8s_vineyard_daemonset = None + k8s_vineyard_cpu = 0.5 k8s_vineyard_mem = "512Mi" vineyard_shared_mem = "4Gi" @@ -84,11 +86,16 @@ class GSConfig(object): # the node selector can be a dict, see also: https://tinyurl.com/3nx6k7ph k8s_coordinator_pod_node_selector = None - k8s_etcd_pod_node_selector = None k8s_engine_pod_node_selector = None # launch graphscope with mars with_mars = False + with_analytical = True + with_analytical_java = False + with_interactive = True + with_learning = True + # Demo dataset related + with_dataset = False k8s_volumes = {} @@ -108,8 +115,5 @@ class GSConfig(object): # disable dangling check by setting -1. dangling_timeout_seconds = 600 - # Demo dataset related - mount_dataset = None - # download_retries dataset_download_retries = 3 diff --git a/python/graphscope/deploy/kubernetes/cluster.py b/python/graphscope/deploy/kubernetes/cluster.py index 530f3ee4caa4..c038e6b27335 100644 --- a/python/graphscope/deploy/kubernetes/cluster.py +++ b/python/graphscope/deploy/kubernetes/cluster.py @@ -16,7 +16,7 @@ # limitations under the License. 
# - +import base64 import json import logging import os @@ -29,13 +29,8 @@ from kubernetes.client.rest import ApiException as K8SApiException from graphscope.config import GSConfig as gs_config -from graphscope.deploy.kubernetes.resource_builder import ClusterRoleBindingBuilder -from graphscope.deploy.kubernetes.resource_builder import ClusterRoleBuilder -from graphscope.deploy.kubernetes.resource_builder import GSCoordinatorBuilder -from graphscope.deploy.kubernetes.resource_builder import NamespaceBuilder -from graphscope.deploy.kubernetes.resource_builder import RoleBindingBuilder -from graphscope.deploy.kubernetes.resource_builder import RoleBuilder -from graphscope.deploy.kubernetes.resource_builder import ServiceBuilder +from graphscope.deploy.kubernetes.resource_builder import CoordinatorDeployment +from graphscope.deploy.kubernetes.resource_builder import ResourceBuilder from graphscope.deploy.kubernetes.utils import KubernetesPodWatcher from graphscope.deploy.kubernetes.utils import delete_kubernetes_object from graphscope.deploy.kubernetes.utils import get_service_endpoints @@ -52,22 +47,21 @@ class KubernetesClusterLauncher(Launcher): """Class for setting up GraphScope instance on kubernetes cluster.""" - _coordinator_builder_cls = GSCoordinatorBuilder - _coordinator_name_prefix = "coordinator-" - _coordinator_service_name_prefix = "coordinator-service-" - _coordinator_container_name = "coordinator" _role_name_prefix = "gs-reader-" - _role_binding_name_prefix = "gs-reader-binding-" + _role_binding_name_prefix = f"{_role_name_prefix}binding-" _cluster_role_name_prefix = "gs-cluster-reader-" - _cluster_role_binding_name_prefix = "gs-cluster-reader-binding-" + _cluster_role_binding_name_prefix = f"{_cluster_role_name_prefix}binding-" _random_coordinator_service_port = random.randint(59001, 60000) _url_pattern = 
r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))" # noqa: E501 _endpoint_pattern = r"(?:http.*://)?(?P<host>[^:/ ]+).?(?P<port>[0-9]*).*" + _coordinator_container_name = "coordinator" + _coordinator_service_port_name = "coordinator" + def __init__( self, api_client=None, @@ -75,10 +69,12 @@ def __init__( k8s_service_type=None, num_workers=None, preemptive=None, - k8s_gs_image=None, - k8s_etcd_image=None, + k8s_image_registry=None, + k8s_image_repository=None, + k8s_image_tag=None, k8s_image_pull_policy=None, k8s_image_pull_secrets=None, + k8s_vineyard_image=None, k8s_vineyard_daemonset=None, k8s_vineyard_cpu=None, k8s_vineyard_mem=None, @@ -87,42 +83,42 @@ def __init__( k8s_engine_mem=None, k8s_coordinator_cpu=None, k8s_coordinator_mem=None, - etcd_addrs=None, - etcd_listening_client_port=None, - etcd_listening_peer_port=None, - k8s_etcd_num_pods=None, - k8s_etcd_cpu=None, - k8s_etcd_mem=None, k8s_mars_worker_cpu=None, k8s_mars_worker_mem=None, k8s_mars_scheduler_cpu=None, k8s_mars_scheduler_mem=None, k8s_coordinator_pod_node_selector=None, - k8s_etcd_pod_node_selector=None, k8s_engine_pod_node_selector=None, with_mars=None, + with_analytical=None, + with_analytical_java=None, + with_interactive=None, + with_learning=None, k8s_volumes=None, - timeout_seconds=None, + timeout_seconds=600, dangling_timeout_seconds=None, - k8s_waiting_for_delete=None, - mount_dataset=None, - k8s_dataset_image=None, - **kwargs + k8s_waiting_for_delete=False, + with_dataset=False, + **kwargs, ): + super().__init__() self._api_client = api_client self._core_api = kube_client.CoreV1Api(api_client) self._app_api = kube_client.AppsV1Api(api_client) self._rbac_api = kube_client.RbacAuthorizationV1Api(api_client) self._saved_locals = locals() - - self._namespace = self._saved_locals["k8s_namespace"] - self._image_pull_secrets = self._saved_locals["k8s_image_pull_secrets"]
+ self._service_type = k8s_service_type + self._namespace = k8s_namespace + self._registry = k8s_image_registry + self._repository = k8s_image_repository + self._tag = k8s_image_tag + self._image_pull_policy = k8s_image_pull_policy + self._image_pull_secrets = k8s_image_pull_secrets if self._image_pull_secrets is None: self._image_pull_secrets = [] elif not isinstance(self._image_pull_secrets, list): self._image_pull_secrets = [self._image_pull_secrets] - self._image_pull_secrets_str = ",".join(self._image_pull_secrets) self._instance_id = random_string(6) self._role_name = self._role_name_prefix + self._instance_id @@ -130,34 +126,32 @@ def __init__( self._cluster_role_name = "" self._cluster_role_binding_name = "" - # all resource created inside namsapce + # all resource created inside namespace self._resource_object = [] self._coordinator_name = self._coordinator_name_prefix + self._instance_id - self._coordinator_service_name = ( - self._coordinator_service_name_prefix + self._instance_id - ) - # environment variable - self._coordinator_envs = kwargs.pop("coordinator_envs", dict()) - - if "GS_COORDINATOR_MODULE_NAME" in os.environ: - self._coordinator_module_name = os.environ["GS_COORDINATOR_MODULE_NAME"] - else: - self._coordinator_module_name = "gscoordinator" + self._coordinator_service_name = self._coordinator_name self._closed = False # pods watcher - self._coordinator_pods_watcher = [] + self._coordinator_pods_watcher = None self._logs = [] self._delete_namespace = False + self._labels = { + "app.kubernetes.io/name": "graphscope", + "app.kubernetes.io/instance": self._instance_id, + "app.kubernetes.io/version": __version__, + "app.kubernetes.io/component": "coordinator", + } + def __del__(self): self.stop() def poll(self): - """Check the coordinator pod status, 0 for successed.""" + """Check the coordinator pod status, 0 for success.""" return 0 def get_namespace(self): @@ -177,52 +171,35 @@ def _get_free_namespace(self): if not 
self._namespace_exist(namespace): return namespace - def _namespace_exist(self, namespace): + def _resource_exist(self, func, *args): try: - self._core_api.read_namespace(namespace) + func(*args) except K8SApiException as e: if e.status != 404: # Not found raise return False return True + def _namespace_exist(self, namespace): + return self._resource_exist(self._core_api.read_namespace, namespace) + def _role_exist(self, namespace, role): - try: - self._rbac_api.read_namespaced_role(name=role, namespace=namespace) - except K8SApiException as e: - if e.status != 404: - raise - return False - return True + return self._resource_exist( + self._rbac_api.read_namespaced_role, role, namespace + ) def _cluster_role_exist(self, cluster_role): - try: - self._rbac_api.read_cluster_role(name=cluster_role) - except K8SApiException as e: - if e.status != 404: - raise - return False - return True + return self._resource_exist(self._rbac_api.read_cluster_role, cluster_role) def _role_binding_exist(self, namespace, role_binding): - try: - self._rbac_api.read_namespaced_role_binding( - name=role_binding, namespace=namespace - ) - except K8SApiException as e: - if e.status != 404: - raise - return False - return True + return self._resource_exist( + self._rbac_api.read_namespaced_role_binding, role_binding, namespace + ) def _cluster_role_binding_exist(self, cluster_role_binding): - try: - self._rbac_api.read_cluster_role_binding(name=cluster_role_binding) - except K8SApiException as e: - if e.status != 404: - raise - return False - return True + return self._resource_exist( + self._rbac_api.read_cluster_role_binding, cluster_role_binding + ) def _create_namespace(self): if self._namespace is None: @@ -231,165 +208,130 @@ def _create_namespace(self): if self._namespace is None: self._namespace = self._get_free_namespace() if not self._namespace_exist(self._namespace): - self._core_api.create_namespace(NamespaceBuilder(self._namespace).build()) + namespace = 
ResourceBuilder.get_namespace(self._namespace) + self._core_api.create_namespace(namespace) self._delete_namespace = True def _create_role_and_binding(self): - self._cluster_role_name = self._cluster_role_name_prefix + str(self._namespace) - self._cluster_role_binding_name = self._cluster_role_binding_name_prefix + str( - self._namespace + self._cluster_role_name = self._cluster_role_name_prefix + self._namespace + self._cluster_role_binding_name = ( + self._cluster_role_binding_name_prefix + self._namespace ) # create a role and bind to default service account. targets = [] if not self._role_exist(namespace=self._namespace, role=self._role_name): - role_builer = RoleBuilder( + role = ResourceBuilder.get_role( name=self._role_name, namespace=self._namespace, - api_groups="apps,extensions,", - resources="configmaps,deployments,deployments/status,endpoints,events,pods,pods/log,pods/exec,pods/status,services,replicasets", # noqa: E501 + api_groups=",apps,extensions", # The leading comma is necessary, represents for core api group. 
+ resources="configmaps,deployments,deployments/status,statefulsets,statefulsets/status,endpoints,events,pods,pods/log,pods/exec,pods/status,services,replicasets", # noqa: E501 verbs="create,delete,get,update,watch,list", + labels=self._labels, ) - targets.append( - self._rbac_api.create_namespaced_role( - self._namespace, role_builer.build() - ) - ) + ret = self._rbac_api.create_namespaced_role(self._namespace, role) + targets.append(ret) - if not self._role_binding_exist( - namespace=self._namespace, role_binding=self._role_binding_name - ): - role_binding_builder = RoleBindingBuilder( + if not self._role_binding_exist(self._namespace, self._role_binding_name): + role_binding = ResourceBuilder.get_role_binding( name=self._role_binding_name, namespace=self._namespace, role_name=self._role_name, service_account_name="default", + labels=self._labels, ) - targets.append( - self._rbac_api.create_namespaced_role_binding( - self._namespace, role_binding_builder.build() - ) + ret = self._rbac_api.create_namespaced_role_binding( + self._namespace, role_binding ) + targets.append(ret) if self._delete_namespace: # Create clusterRole to delete namespace. 
if not self._cluster_role_exist(cluster_role=self._cluster_role_name): - cluster_role_builder = ClusterRoleBuilder( + cluster_role = ResourceBuilder.get_cluster_role( name=self._cluster_role_name, - api_groups="apps,", + api_groups=",apps", # The leading comma is necessary: namespaces belong to the core api group. resources="namespaces", verbs="create,delete,get,update,watch,list", + labels=self._labels, ) - targets.append( - self._rbac_api.create_cluster_role(cluster_role_builder.build()) - ) + ret = self._rbac_api.create_cluster_role(cluster_role) + targets.append(ret) if not self._cluster_role_binding_exist( cluster_role_binding=self._cluster_role_binding_name ): - cluster_role_binding_builder = ClusterRoleBindingBuilder( + cluster_role_binding = ResourceBuilder.get_cluster_role_binding( name=self._cluster_role_binding_name, namespace=self._namespace, - cluster_role_name=self._cluster_role_name, + role_name=self._cluster_role_name, service_account_name="default", + labels=self._labels, ) - targets.append( - self._rbac_api.create_cluster_role_binding( - cluster_role_binding_builder.build() - ) - ) - + ret = self._rbac_api.create_cluster_role_binding(cluster_role_binding) + targets.append(ret) self._resource_object.extend(targets) def _create_coordinator(self): logger.info("Launching coordinator...") targets = [] - labels = { - "app.kubernetes.io/name": "graphscope", - "app.kubernetes.io/instance": self._instance_id, - "app.kubernetes.io/version": __version__, - "app.kubernetes.io/component": "coordinator", - } - - # create coordinator service - service_builder = ServiceBuilder( - self._coordinator_service_name, - service_type=self._saved_locals["k8s_service_type"], - port=self._random_coordinator_service_port, - selector=labels, - ) - targets.append( - self._core_api.create_namespaced_service( - self._namespace, service_builder.build() - ) - ) - - time.sleep(1) - - # create coordinator deployment - coordinator_builder = self._coordinator_builder_cls( - name=self._coordinator_name, - labels=labels, - replicas=1, -
image_pull_policy=self._saved_locals["k8s_image_pull_policy"], - ) - # enable host network - if "ENABLE_HOST_NETWORK" in os.environ: - coordinator_builder.host_network = True - - for name in self._image_pull_secrets: - coordinator_builder.add_image_pull_secret(name) - - envs = { + env = { "PYTHONUNBUFFERED": "TRUE", "KUBE_NAMESPACE": self._namespace, "INSTANCE_ID": self._instance_id, - "GREMLIN_EXPOSE": self._saved_locals["k8s_service_type"], + "GREMLIN_EXPOSE": self._service_type, } if "KUBE_API_ADDRESS" in os.environ: - envs.update({"KUBE_API_ADDRESS": os.environ["KUBE_API_ADDRESS"]}) + env["KUBE_API_ADDRESS"] = os.environ["KUBE_API_ADDRESS"] + if self._registry: + image_prefix = f"{self._registry}/{self._repository}" + else: + image_prefix = self._repository + image = f"{image_prefix}/coordinator:{self._tag}" + args = self._get_coordinator_args() - coordinator_builder.add_simple_envs(envs) + image_pull_policy = self._saved_locals["k8s_image_pull_policy"] + host_network = "ENABLE_HOST_NETWORK" in os.environ + node_selector = self._saved_locals["k8s_coordinator_pod_node_selector"] + port = self._random_coordinator_service_port - if "GSCOORDINATOR_PORTS" in os.environ: - # a list of port, comma separated - # e.g. 
50001,50002,50003,50004 - ports = [int(p) for p in os.environ["GSCOORDINATOR_PORTS"].split(",")] - else: - ports = [self._random_coordinator_service_port] - - coordinator_builder.add_coordinator_container( - cmd=["/bin/bash"], - args=self._build_coordinator_cmd(), - name=self._coordinator_container_name, - image=self._saved_locals["k8s_gs_image"], - cpu=self._saved_locals["k8s_coordinator_cpu"], - mem=self._saved_locals["k8s_coordinator_mem"], - preemptive=self._saved_locals["preemptive"], - ports=ports, - module_name=self._coordinator_module_name, + coordinator = CoordinatorDeployment( + namespace=self._namespace, + name=self._coordinator_name, + image=image, + args=args, + labels=self._labels, + image_pull_secret=self._image_pull_secrets, + image_pull_policy=image_pull_policy, + node_selector=node_selector, + env=env, + host_network=host_network, + port=port, ) - if self._saved_locals["k8s_coordinator_pod_node_selector"] is not None: - coordinator_builder.add_coordinator_pod_node_selector( - self._saved_locals["k8s_coordinator_pod_node_selector"] - ) - targets.append( - self._app_api.create_namespaced_deployment( - self._namespace, coordinator_builder.build() - ) + deployment = coordinator.get_coordinator_deployment() + response = self._app_api.create_namespaced_deployment( + self._namespace, deployment ) + targets.append(response) + + # create coordinator service + service = coordinator.get_coordinator_service( + service_type=self._service_type, port=port + ) + response = self._core_api.create_namespaced_service(self._namespace, service) + targets.append(response) self._resource_object.extend(targets) - def _build_coordinator_cmd(self): - cmd = [ - "unset", - "LD_PRELOAD", - "&&", + def base64_encode(self, string): + return base64.b64encode(string.encode("utf-8")).decode("utf-8") + + def _get_coordinator_args(self): + args = [ "python3", "-m", - self._coordinator_module_name, + "gscoordinator", "--cluster_type", "k8s", "--port", @@ -405,49 +347,47 @@ def 
_build_coordinator_cmd(self): "--k8s_namespace", self._namespace, "--k8s_service_type", - str(self._saved_locals["k8s_service_type"]), - "--k8s_gs_image", - self._saved_locals["k8s_gs_image"], - "--k8s_etcd_image", - self._saved_locals["k8s_etcd_image"], + self._service_type, + "--k8s_image_repository", + self._repository, "--k8s_image_pull_policy", - self._saved_locals["k8s_image_pull_policy"], - "--k8s_image_pull_secrets", - self._image_pull_secrets_str if self._image_pull_secrets_str else '""', + self._image_pull_policy, "--k8s_coordinator_name", self._coordinator_name, "--k8s_coordinator_service_name", self._coordinator_service_name, - "--k8s_etcd_num_pods", - str(self._saved_locals["k8s_etcd_num_pods"]), - "--k8s_etcd_cpu", - str(self._saved_locals["k8s_etcd_cpu"]), - "--k8s_etcd_mem", - self._saved_locals["k8s_etcd_mem"], - "--k8s_vineyard_daemonset", - str(self._saved_locals["k8s_vineyard_daemonset"]), + "--k8s_vineyard_image", + self._saved_locals["k8s_vineyard_image"], "--k8s_vineyard_cpu", str(self._saved_locals["k8s_vineyard_cpu"]), "--k8s_vineyard_mem", - self._saved_locals["k8s_vineyard_mem"], + str(self._saved_locals["k8s_vineyard_mem"]), "--vineyard_shared_mem", - self._saved_locals["vineyard_shared_mem"], + str(self._saved_locals["vineyard_shared_mem"]), "--k8s_engine_cpu", str(self._saved_locals["k8s_engine_cpu"]), "--k8s_engine_mem", - self._saved_locals["k8s_engine_mem"], + str(self._saved_locals["k8s_engine_mem"]), "--k8s_mars_worker_cpu", str(self._saved_locals["k8s_mars_worker_cpu"]), "--k8s_mars_worker_mem", - self._saved_locals["k8s_mars_worker_mem"], + str(self._saved_locals["k8s_mars_worker_mem"]), "--k8s_mars_scheduler_cpu", str(self._saved_locals["k8s_mars_scheduler_cpu"]), "--k8s_mars_scheduler_mem", - self._saved_locals["k8s_mars_scheduler_mem"], + str(self._saved_locals["k8s_mars_scheduler_mem"]), "--k8s_with_mars", str(self._saved_locals["with_mars"]), - "--k8s_volumes", - 
"'{0}'".format(json.dumps(self._saved_locals["k8s_volumes"])), + "--k8s_with_analytical", + str(self._saved_locals["with_analytical"]), + "--k8s_with_analytical_java", + str(self._saved_locals["with_analytical_java"]), + "--k8s_with_interactive", + str(self._saved_locals["with_interactive"]), + "--k8s_with_learning", + str(self._saved_locals["with_learning"]), + "--k8s_with_dataset", + str(self._saved_locals["with_dataset"]), "--timeout_seconds", str(self._saved_locals["timeout_seconds"]), "--dangling_timeout_seconds", @@ -457,78 +397,76 @@ def _build_coordinator_cmd(self): "--k8s_delete_namespace", str(self._delete_namespace), ] - if self._saved_locals["mount_dataset"] is not None: - cmd.extend( + if self._registry: + args.extend( + [ + "--k8s_image_registry", + self._registry, + ] + ) + if self._tag: + args.extend( [ - "--mount_dataset", - self._saved_locals["mount_dataset"], - "--k8s_dataset_image", - self._saved_locals["k8s_dataset_image"], + "--k8s_image_tag", + self._tag, ] ) - if self._saved_locals["etcd_addrs"] is not None: - cmd.extend(["--etcd_addrs", self._saved_locals["etcd_addrs"]]) - if self._saved_locals["etcd_listening_client_port"] is not None: - cmd.extend( + if self._image_pull_secrets: + args.extend( [ - "--etcd_listening_client_port", - str(self._saved_locals["etcd_listening_client_port"]), + "--k8s_image_pull_secrets", + ",".join(self._image_pull_secrets), ] ) - if self._saved_locals["etcd_listening_peer_port"] is not None: - cmd.extend( + volumes = self._saved_locals["k8s_volumes"] + if volumes: + args.extend( [ - "--etcd_listening_peer_port", - str(self._saved_locals["etcd_listening_peer_port"]), + "--k8s_volumes", + f"{self.base64_encode(json.dumps(volumes))}", ] ) - if self._saved_locals["k8s_etcd_pod_node_selector"] is not None: - cmd.extend( + if self._saved_locals["k8s_vineyard_daemonset"] is not None: + args.extend( [ - "--k8s_etcd_pod_node_selector", - "'{0}'".format( - json.dumps(self._saved_locals["k8s_etcd_pod_node_selector"]) - ), 
+ "--k8s_vineyard_daemonset", + str(self._saved_locals["k8s_vineyard_daemonset"]), ] ) + if self._saved_locals["k8s_engine_pod_node_selector"] is not None: - cmd.extend( + args.extend( [ "--k8s_engine_pod_node_selector", - "'{0}'".format( - json.dumps(self._saved_locals["k8s_engine_pod_node_selector"]) - ), + f"{self.base64_encode(json.dumps(self._saved_locals['k8s_engine_pod_node_selector']))}", ] ) - return ["-c", " ".join(cmd)] + return args def _create_services(self): self._create_coordinator() def _waiting_for_services_ready(self): - deployment = self._app_api.read_namespaced_deployment_status( + response = self._app_api.read_namespaced_deployment_status( namespace=self._namespace, name=self._coordinator_name ) # get deployment pods - selector = "" - for k, v in deployment.spec.selector.match_labels.items(): - selector += k + "=" + v + "," - selector = selector[:-1] + match_labels = response.spec.selector.match_labels + selector = ",".join([f"{k}={v}" for k, v in match_labels.items()]) pods = self._core_api.list_namespaced_pod( namespace=self._namespace, label_selector=selector ) - - for pod in pods.items: - self._coordinator_pods_watcher.append( - KubernetesPodWatcher( - self._api_client, - self._namespace, - pod, - self._coordinator_container_name, - ) - ) - self._coordinator_pods_watcher[-1].start() + assert len(pods.items) == 1, "coordinator deployment should have only one pod" + pod = pods.items[0] + self._coordinator_pods_watcher = KubernetesPodWatcher( + api_client=self._api_client, + namespace=self._namespace, + pod=pod, + container=self._coordinator_container_name, + ) + self._coordinator_pods_watcher.start() if wait_for_deployment_complete( api_client=self._api_client, @@ -536,29 +474,25 @@ def _waiting_for_services_ready(self): name=self._coordinator_name, timeout_seconds=self._saved_locals["timeout_seconds"], ): - for pod_watcher in self._coordinator_pods_watcher: - pod_watcher.stop() + self._coordinator_pods_watcher.stop() def
_try_to_get_coordinator_service_from_configmap(self): - config_map_name = "gs-coordinator-{}".format(self._instance_id) + config_map_name = f"gs-coordinator-{self._instance_id}" start_time = time.time() while True: try: - api_response = self._core_api.read_namespaced_config_map( + response = self._core_api.read_namespaced_config_map( name=config_map_name, namespace=self._namespace ) + return f"{response.data['ip']}:{response.data['port']}" except K8SApiException: pass - else: - return "{}:{}".format( - api_response.data["ip"], api_response.data["port"] - ) time.sleep(1) if time.time() - start_time > self._saved_locals["timeout_seconds"]: - raise TimeoutError("Gete coordinator service from configmap timeout") + raise TimeoutError("Get coordinator service from configmap timeout") def _get_coordinator_endpoint(self): - if self._saved_locals["k8s_service_type"] is None: + if self._service_type is None: # try to get endpoint from configmap return self._try_to_get_coordinator_service_from_configmap() @@ -567,26 +501,24 @@ def _get_coordinator_endpoint(self): api_client=self._api_client, namespace=self._namespace, name=self._coordinator_service_name, - service_type=self._saved_locals["k8s_service_type"], + service_type=self._service_type, ) return endpoints[0] def _dump_coordinator_failed_status(self): # Dump failed status even show_log is False + if self._coordinator_pods_watcher is None: + return if not gs_config.show_log: - for pod_watcher in self._coordinator_pods_watcher: - while True: - try: - message = pod_watcher.poll(timeout_seconds=3) - except queue.Empty: - pod_watcher.stop() - break - else: - logger.error(message, extra={"simple": True}) - else: - for pod_watcher in self._coordinator_pods_watcher: - pod_watcher.stop() + while True: + try: + message = self._coordinator_pods_watcher.poll(timeout_seconds=3) + logger.error(message, extra={"simple": True}) + except queue.Empty: + break + self._coordinator_pods_watcher.stop() + self._coordinator_pods_watcher = None 
def start(self): """Launch graphscope instance on kubernetes cluster. @@ -620,52 +552,59 @@ def start(self): def stop(self, wait=False): """Stop graphscope instance on kubernetes cluster. - Args: - wait: bool, optional - Waiting for delete. Defaults to False. - Raises: TimeoutError: Waiting for stop instance timeout when ``wait`` or ``_waiting_for_delete`` is True. """ - if not self._closed: - # delete resources created by graphscope inside namespace - # make sure delete permission resouces in the end - for target in reversed(self._resource_object): - delete_kubernetes_object( - api_client=self._api_client, - target=target, - wait=self._saved_locals["k8s_waiting_for_delete"], - timeout_seconds=self._saved_locals["timeout_seconds"], - ) - self._resource_object = [] - if self._delete_namespace: - # delete namespace - api = CoreV1Api(self._api_client) - try: - api.delete_namespace(self._namespace) - except K8SApiException: - # namespace already deleted. + # delete resources created by graphscope inside namespace + # make sure delete permission resources in the end + logger.info("Stopping coordinator") + for target in reversed(self._resource_object): + delete_kubernetes_object( + api_client=self._api_client, + target=target, + wait=self._saved_locals["k8s_waiting_for_delete"], + timeout_seconds=self._saved_locals["timeout_seconds"], + ) + self._resource_object = [] + if self._delete_namespace: + # delete namespace + api = CoreV1Api(self._api_client) + try: + api.delete_namespace(self._namespace) + self._delete_namespace = False + except K8SApiException as e: + if e.status == 404: # namespace already deleted. 
pass else: - if wait or self._saved_locals["k8s_waiting_for_delete"]: - start_time = time.time() - while True: - try: - api.read_namespace(self._namespace) - except K8SApiException as ex: - if ex.status != 404: - raise - break - else: - time.sleep(1) - if ( - self._saved_locals["timeout_seconds"] - and time.time() - start_time - > self._saved_locals["timeout_seconds"] - ): - logger.info( - "Deleting namespace %s timeout", self._namespace - ) - break - self._closed = True + raise + logger.info("Stopped coordinator") + + +if __name__ == "__main__": + from kubernetes import config as kube_config + + kube_config.load_kube_config() + client = kube_client.ApiClient() + namespace = "demo" + service_type = "NodePort" + num_workers = 2 + k8s_image_registry = "registry-vpc.cn-hongkong.aliyuncs.com" + k8s_image_repository = "graphscope" + # k8s_image_tag = "0.17.0" + k8s_image_tag = "siyuan" + k8s_image_pull_policy = "IfNotPresent" + + launcher = KubernetesClusterLauncher( + api_client=client, + k8s_namespace=namespace, + k8s_service_type=service_type, + num_workers=num_workers, + k8s_image_registry=k8s_image_registry, + k8s_image_repository=k8s_image_repository, + k8s_image_tag=k8s_image_tag, + k8s_image_pull_policy=k8s_image_pull_policy, + ) + launcher.start() + print(launcher._get_coordinator_endpoint()) + launcher.stop() diff --git a/python/graphscope/deploy/kubernetes/resource_builder.py b/python/graphscope/deploy/kubernetes/resource_builder.py index 5783d0c4116f..4d33ef6604bd 100644 --- a/python/graphscope/deploy/kubernetes/resource_builder.py +++ b/python/graphscope/deploy/kubernetes/resource_builder.py @@ -18,1265 +18,335 @@ import logging -import math -import os import sys +from kubernetes import client as kube_client + from graphscope.deploy.kubernetes.utils import parse_readable_memory from graphscope.framework.utils import get_tempdir logger = logging.getLogger("graphscope") -def _remove_nones(o): - return dict((k, v) for k, v in o.items() if v is not None) - - 
-def resolve_volume_builder(name, value): - """Resolve specified volume with value dict.""" - if "type" not in value or "field" not in value or "mounts" not in value: - logger.warning("Volume %s must contains 'type' 'field' and 'mounts'", name) - return None - return VolumeBuilder( - name=name, - volume_type=value["type"], - field=value["field"], - mounts_list=value["mounts"], - ) - - -class ConfigMapBuilder(object): - """Builder for k8s ConfigMap""" - - def __init__(self, name): - self._name = name - self._kvs = dict() - - def add_kv(self, key, value): - if value: - self._kvs[key] = value - - def add_simple_kvs(self, kvs): - for k, v in kvs.items() or (): - self.add_kv(k, v) - - def build(self): - return { - "kind": "ConfigMap", - "metadata": {"name": self._name}, - "data": self._kvs, - } - - -class NamespaceBuilder(object): - """Builder for k8s namespace.""" - - def __init__(self, name): - self._name = name - - def build(self): - return { - "kind": "Namespace", - "metadata": { - "name": self._name, - "labels": { - "name": self._name, - }, - }, - } - - -class LocalObjectRefBuilder(object): - """Builder for k8s LocalObjectReference.""" - - def __init__(self, name): - self._name = name - - def build(self): - return {"name": self._name} - - -class RoleBuilder(object): - """Builder for k8s RBAC roles.""" - - def __init__(self, name, namespace, api_groups, resources, verbs): - self._name = name - self._namespace = namespace - self._api_groups = api_groups.split(",") - self._resources = resources.split(",") - self._verbs = verbs.split(",") - - def build(self): - return { - "kind": "Role", - "metadata": {"name": self._name, "namespace": self._namespace}, - "rules": [ - { - "apiGroups": self._api_groups, - "resources": self._resources, - "verbs": self._verbs, - } - ], - } - - -class ClusterRoleBuilder(object): - """Builder for k8s RBAC roles.""" - - def __init__(self, name, api_groups, resources, verbs): - self._name = name - self._api_groups = api_groups.split(",") - 
self._resources = resources.split(",") - self._verbs = verbs.split(",") - - def build(self): - return { - "kind": "ClusterRole", - "metadata": {"name": self._name}, - "rules": [ - { - "apiGroups": self._api_groups, - "resources": self._resources, - "verbs": self._verbs, - } - ], - } - - -class RoleBindingBuilder(object): - """Builder for k8s RBAC role bindings.""" - - def __init__(self, name, namespace, role_name, service_account_name): - self._name = name - self._namespace = namespace - self._role_name = role_name - self._service_account_name = service_account_name - - def build(self): - return { - "kind": "RoleBinding", - "metadata": {"name": self._name, "namespace": self._namespace}, - "roleRef": { - "apiGroup": "rbac.authorization.k8s.io", - "kind": "Role", - "name": self._role_name, - }, - "subjects": [ - { - "kind": "ServiceAccount", - "name": self._service_account_name, - "namespace": self._namespace, - } - ], - } - - -class ClusterRoleBindingBuilder(object): - """Builder for k8s RBAC cluster role bindings.""" - - def __init__(self, name, namespace, cluster_role_name, service_account_name): - self._name = name - self._namespace = namespace - self._cluster_role_name = cluster_role_name - self._service_account_name = service_account_name - - def build(self): - return { - "kind": "ClusterRoleBinding", - "metadata": {"name": self._name}, - "roleRef": { - "apiGroup": "rbac.authorization.k8s.io", - "kind": "ClusterRole", - "name": self._cluster_role_name, - }, - "subjects": [ - { - "kind": "ServiceAccount", - "name": self._service_account_name, - "namespace": self._namespace, - } - ], - } - - -class ServiceBuilder(object): - """Builder for k8s services.""" - - _annotations = { - "service.beta.kubernetes.io/alibaba-cloud-loadbalancer-health-check-type": "tcp", - "service.beta.kubernetes.io/alibaba-cloud-loadbalancer-health-check-connect-timeout": "8", - "service.beta.kubernetes.io/alibaba-cloud-loadbalancer-healthy-threshold": "2", - 
"service.beta.kubernetes.io/alibaba-cloud-loadbalancer-unhealthy-threshold": "2", - "service.beta.kubernetes.io/alibaba-cloud-loadbalancer-health-check-interval": "1", - } - - def __init__( - self, - name, - service_type, - selector, - port, - target_port=None, - node_port=None, - protocol=None, - external_traffic_policy=None, - ): - self._name = name - self._type = service_type - self._protocol = protocol or "TCP" - self._selector = selector - self._port = port - self._target_port = target_port - self._node_port = node_port - self._external_traffic_policy = external_traffic_policy - - def build(self): - if isinstance(self._port, (range, list, tuple)): - ports = [] - for idx, port in enumerate(self._port): - ports.append( - _remove_nones( - { - "name": "%s-%d" % (self._name, idx), - "protocol": self._protocol, - "port": port, - } - ) - ) - else: - ports = [ - _remove_nones( - { - "protocol": self._protocol, - "port": self._port, - "targetPort": self._target_port, - "nodePort": self._node_port, - } - ), - ] - - return { - "kind": "Service", - "metadata": { - "annotations": self._annotations, - "name": self._name, - "labels": self._selector, - }, - "spec": _remove_nones( - { - "type": self._type, - "selector": self._selector, - "ports": ports, - "externalTrafficPolicy": self._external_traffic_policy, - } - ), - } - - -class ContainerEnvBuilder(object): - """Builder for k8s container environments.""" - - def __init__(self, name, value): - self._name = name - self._value = value - - def build(self): - result = dict(name=self._name) - result["value"] = str(self._value) - return result - - -class ContainerFieldRefEnvBuilder(object): - """Builder for k8s container environments.""" - - def __init__(self, name, field): - self._name = name - self._field = field - - def build(self): - result = dict(name=self._name) - result["valueFrom"] = { - "fieldRef": { - "fieldPath": self._field, - } - } - return result - - -BASE_MACHINE_ENVS = { - "MY_NODE_NAME": "spec.nodeName", - 
"MY_POD_NAME": "metadata.name", - "MY_POD_NAMESPACE": "metadata.namespace", - "MY_POD_IP": "status.podIP", - "MY_HOST_NAME": "status.podIP", -} - - -class PortBuilder(object): - """Builder for k8s container port definition.""" - - def __init__(self, container_port): - self._container_port = int(container_port) - - def build(self): - return { - "containerPort": self._container_port, - } - - -class ResourceBuilder(object): - """Builder for k8s computation resources.""" - - def __init__(self, cpu, memory): - self._cpu = cpu - self._memory = parse_readable_memory(memory) - - def build(self): - return { - "cpu": float(self._cpu), - "memory": str(self._memory), - } - - -class VolumeBuilder(object): - """Builder for k8s volumes.""" - - def __init__(self, name, volume_type, field, mounts_list): - self._name = name - self._type = volume_type - self._field = field - self._mounts_list = mounts_list - if not isinstance(self._mounts_list, list): - self._mounts_list = [self._mounts_list] - - for mount in self._mounts_list: - mount["name"] = self._name - - def build(self): - return {"name": self._name, self._type: self._field} - - def build_mount(self): - return self._mounts_list - - -class HttpHeaderBuilder(object): - """Builder for k8s http header.""" - - def __init__(self, name, value): - self._name = name - self._value = value - - def build(self): - result = dict(name=self._name) - result["value"] = str(self._value) - return result - - -class ProbeBuilder(object): - """Builder for k8s liveness and readiness probes.""" - - def __init__( - self, - initial_delay=10, - period=2, - timeout=1, - success_thresh=None, - failure_thresh=None, - ): - self._initial_delay = initial_delay - self._period = period - self._timeout = timeout - self._success_thresh = success_thresh - self._failure_thresh = failure_thresh - - def build(self): - return _remove_nones( - { - "initialDelaySeconds": self._initial_delay, - "periodSeconds": self._period, - "timeoutSeconds": self._timeout, - 
"successThreshold": self._success_thresh, - "failureThreshold": self._failure_thresh, - } +class ResourceBuilder: + @staticmethod + def get_configmap(name, kvs): + metadata = kube_client.V1ObjectMeta(name=name) + configmap = kube_client.V1ConfigMap(metadata=metadata, data=kvs) + return configmap + + @staticmethod + def get_role(name, namespace, api_groups, resources, verbs, labels): + metadata = kube_client.V1ObjectMeta(name=name, namespace=namespace) + metadata.labels = labels + rule = kube_client.V1PolicyRule( + api_groups=api_groups.split(','), + resources=resources.split(','), + verbs=verbs.split(','), ) - - -class ExecProbeBuilder(ProbeBuilder): - """Builder for k8s executing probes.""" - - def __init__(self, command, *args, **kwargs): - super().__init__(*args, **kwargs) - self._command = command - - def build(self): - result = {"exec": {"command": self._command}} - result.update(super().build()) - return result - - -class TcpProbeBuilder(ProbeBuilder): - """Builder for k8s tcp probes.""" - - def __init__(self, port, *args, **kwargs): - super().__init__(*args, **kwargs) - self._port = port - - def build(self): - result = {"tcpSocket": {"port": self._port}} - result.update(super().build()) - return result - - -class HttpProbeBuilder(ProbeBuilder): - """Builder for k8s http probes.""" - - def __init__(self, path, port, http_headers, *args, **kwargs): - super().__init__(*args, **kwargs) - self._path = path - self._port = port - self._http_headers = http_headers - - def build(self): - result = { - "httpGet": { - "path": self._path, - "port": self._port, - "httpHeaders": [h.build() for h in self._http_headers], - } - } - result.update(super().build()) - return result - - -class DeploymentBuilder(object): - """Base Builder for k8s deployment.""" - - def __init__( - self, - name, - labels, - replicas, - image_pull_policy, - ): - self._name = name - self._labels = labels - self._replicas = replicas - self._image_pull_policy = image_pull_policy - - self._containers = 
[] - self._volumes = [] - self._envs = dict() - self._image_pull_secrets = [] - self._host_network = False - self._node_selector = dict() - - self.add_field_envs(BASE_MACHINE_ENVS) - - @property - def host_network(self): - return self._host_network - - @host_network.setter - def host_network(self, value): - self._host_network = value - - def set_image_pull_policy(self, policy): - self._image_pull_policy = policy - - def add_env(self, name, value=None): - if value: - self._envs[name] = ContainerEnvBuilder(name, value) - - def add_field_env(self, name, field=None): - if field: - self._envs[name] = ContainerFieldRefEnvBuilder(name, field) - - def add_simple_envs(self, envs): - for k, v in envs.items() or (): - self.add_env(k, v) - - def add_field_envs(self, envs): - for k, v in envs.items() or (): - self.add_field_env(k, v) - - def add_container(self, ctn): - self._containers.append(ctn) - - def add_volume(self, vol): - self._volumes.append(vol) - - def add_image_pull_secret(self, name): - self._image_pull_secrets.append(LocalObjectRefBuilder(name)) - - def add_pod_node_selector(self, node_selector): - if node_selector: - for k, v in node_selector.items(): - self._node_selector[k] = v - - def build_template_spec(self): - result = { - "hostNetwork": self._host_network, - "containers": [ctn for ctn in self._containers], - "volumes": [vol.build() for vol in self._volumes] or None, - "imagePullSecrets": [ips.build() for ips in self._image_pull_secrets] - or None, - "nodeSelector": self._node_selector or None, - } - return dict((k, v) for k, v in result.items() if v) - - def build_selector(self): - result = {"matchLabels": self._labels} - return result - - def build(self): - return { - "kind": "Deployment", - "metadata": { - "name": self._name, - }, - "spec": { - "replicas": int(self._replicas), - "selector": self.build_selector(), - "template": { - "metadata": { - "labels": self._labels, - }, - "spec": self.build_template_spec(), - }, - }, - } - - -class 
ReplicaSetBuilder(object): - """Base Builder for k8s ReplicaSet.""" - - def __init__(self, name, labels, replicas, image_pull_policy): - self._name = name - self._labels = labels - self._replicas = replicas - self._image_pull_policy = image_pull_policy - - self._containers = [] - self._volumes = [] - self._envs = dict() - self._annotations = dict() - self._image_pull_secrets = [] - self._host_network = False - self._node_selector = dict() - - self.add_field_envs(BASE_MACHINE_ENVS) - - @property - def host_network(self): - return self._host_network - - @host_network.setter - def host_network(self, value): - self._host_network = value - - def set_image_pull_policy(self, policy): - self._image_pull_policy = policy - - def add_annotation(self, name, value): - self._annotations[name] = value - - def add_env(self, name, value=None): - if value: - self._envs[name] = ContainerEnvBuilder(name, value) - - def add_field_env(self, name, field=None): - if field: - self._envs[name] = ContainerFieldRefEnvBuilder(name, field) - - def add_simple_envs(self, envs): - for k, v in envs.items() or (): - self.add_env(k, v) - - def add_field_envs(self, envs): - for k, v in envs.items() or (): - self.add_field_env(k, v) - - def add_container(self, ctn): - self._containers.append(ctn) - - def add_volume(self, vol): - self._volumes.append(vol) - - def add_image_pull_secret(self, name): - self._image_pull_secrets.append(LocalObjectRefBuilder(name)) - - def add_pod_node_selector(self, node_selector): - if node_selector: - for k, v in node_selector.items(): - self._node_selector[k] = v - - def build_pod_spec(self): - result = { - "hostNetwork": self._host_network, - "containers": [ctn for ctn in self._containers], - "volumes": [vol.build() for vol in self._volumes] or None, - "imagePullSecrets": [ips.build() for ips in self._image_pull_secrets] - or None, - "nodeSelector": self._node_selector or None, - } - return dict((k, v) for k, v in result.items() if v) - - def build_selector(self): - 
result = {"matchLabels": self._labels} - return result - - def build(self): - return { - "kind": "ReplicaSet", - "metadata": { - "name": self._name, - }, - "spec": { - "replicas": int(self._replicas), - "selector": self.build_selector(), - "template": { - "metadata": { - "labels": self._labels, - "annotations": self._annotations, - }, - "spec": self.build_pod_spec(), - }, - }, - } - - -class GSEngineBuilder(ReplicaSetBuilder): - """Builder for graphscope analytical engine.""" - - _vineyard_requests_cpu = 0.2 - _vineyard_requests_mem = "128Mi" - - _engine_requests_cpu = 0.2 - _engine_requests_mem = "1Gi" - - _mars_worker_requests_cpu = 0.2 - _mars_worker_requests_mem = "1Gi" - _mars_scheduler_requests_cpu = 0.2 - _mars_scheduler_requests_mem = "1Gi" - - def __init__(self, name, labels, num_workers, image_pull_policy): - self._name = name - self._labels = labels - self._num_workers = num_workers - self._image_pull_policy = image_pull_policy - self._ipc_socket_file = os.path.join( - get_tempdir(), "vineyard_workspace", "vineyard.sock" + role = kube_client.V1Role(metadata=metadata, rules=[rule]) + return role + + @staticmethod + def get_cluster_role(name, api_groups, resources, verbs, labels): + metadata = kube_client.V1ObjectMeta(name=name, labels=labels) + rule = kube_client.V1PolicyRule( + api_groups=api_groups.split(','), + resources=resources.split(','), + verbs=verbs.split(','), ) - super().__init__( - self._name, self._labels, self._num_workers, self._image_pull_policy - ) - - def add_vineyard_container( - self, - name, - image, - cpu, - mem, - shared_mem, - preemptive, - etcd_endpoints, - port, - **kwargs - ): - vineyard_command = " ".join( - [ - sys.executable, - "-m", - "vineyard", - "--size=%s" % str(shared_mem), - '--etcd_endpoint="%s"' % (";".join(etcd_endpoints),), - "--socket=%s" % self._ipc_socket_file, - "--etcd_prefix=vineyard", - ] + role = kube_client.V1ClusterRole(metadata=metadata, rules=[rule]) + return role + + @staticmethod + def 
get_role_binding(name, namespace, role_name, service_account_name, labels): + metadata = kube_client.V1ObjectMeta(name=name, namespace=namespace) + metadata.labels = labels + role_ref = kube_client.V1RoleRef( + kind="Role", name=role_name, api_group="rbac.authorization.k8s.io" ) - commands = [] - commands.append( - "while ! curl --output /dev/null --silent --head --connect-timeout 1 %s" - % etcd_endpoints[0] - ) - commands.append("do sleep 1 && echo -n .") - commands.append("done") - commands.append(vineyard_command) - cmd = ["bash", "-c", "%s" % ("; ".join(commands),)] - - resources_dict = { - "requests": ResourceBuilder( - self._vineyard_requests_cpu, self._vineyard_requests_mem - ).build() - if preemptive - else ResourceBuilder(cpu, mem).build(), - "limits": ResourceBuilder(cpu, mem).build(), - } - - post_start_command = kwargs.pop("post_start_command", None) - pre_stop_command = kwargs.pop("pre_stop_command", None) - lifecycle_dict = _remove_nones( - { - "postStart": { - "exec": {"command": post_start_command}, - } - if post_start_command - else None, - "preStop": { - "exec": {"command": pre_stop_command}, - } - if pre_stop_command - else None, - } - ) - - volumeMounts = [] - for vol in self._volumes: - for vol_mount in vol.build_mount(): - volumeMounts.append(vol_mount) - - super().add_container( - _remove_nones( - { - "command": cmd, - "env": [env.build() for env in self._envs.values()] or None, - "image": image, - "name": name, - "imagePullPolicy": self._image_pull_policy, - "resources": dict((k, v) for k, v in resources_dict.items() if v) - or None, - "ports": [PortBuilder(port).build()], - "volumeMounts": volumeMounts or None, - "livenessProbe": None, - "readinessProbe": None, - "lifecycle": lifecycle_dict or None, - } - ) + subject = kube_client.V1Subject( + kind="ServiceAccount", name=service_account_name, namespace=namespace ) - - def add_engine_container(self, name, image, cpu, mem, preemptive, **kwargs): - cmd = kwargs.pop("cmd", None) - args = 
kwargs.pop("args", None) - - resources_dict = { - "requests": ResourceBuilder( - self._engine_requests_cpu, self._engine_requests_mem - ).build() - if preemptive - else ResourceBuilder(cpu, mem).build(), - "limits": ResourceBuilder(cpu, mem).build(), - } - - post_start_command = kwargs.pop("post_start_command", None) - pre_stop_command = kwargs.pop("pre_stop_command", None) - lifecycle_dict = _remove_nones( - { - "postStart": { - "exec": {"command": post_start_command}, - } - if post_start_command - else None, - "preStop": { - "exec": {"command": pre_stop_command}, - } - if pre_stop_command - else None, - } + role_binding = kube_client.V1RoleBinding( + metadata=metadata, role_ref=role_ref, subjects=[subject] ) + return role_binding - readiness_cmd = [ - "/bin/bash", - "-c", - "ls %s 2>/dev/null" % self._ipc_socket_file, - ] - readiness_probe = ExecProbeBuilder(readiness_cmd) - - # ports range in 8000~9000 will be open if `ports ` param missing. - ports = kwargs.pop("ports", [i for i in range(8000, 9000)]) - if not isinstance(ports, list): - ports = [ports] - - volumeMounts = [] - for vol in self._volumes: - for vol_mount in vol.build_mount(): - volumeMounts.append(vol_mount) - - super().add_container( - _remove_nones( - { - "command": cmd, - "args": args, - "env": [env.build() for env in self._envs.values()] or None, - "image": image, - "name": name, - "imagePullPolicy": self._image_pull_policy, - "resources": dict((k, v) for k, v in resources_dict.items() if v) - or None, - "ports": [PortBuilder(port).build() for port in ports], - "volumeMounts": volumeMounts or None, - "livenessProbe": None, - "readinessProbe": readiness_probe.build(), - "lifecycle": lifecycle_dict or None, - } - ) + @staticmethod + def get_cluster_role_binding(name, namespace, role_name, service_account_name, labels): + metadata = kube_client.V1ObjectMeta(name=name, labels=labels) + role_ref = kube_client.V1RoleRef( + kind="ClusterRole", name=role_name, api_group="rbac.authorization.k8s.io" ) - 
super().add_annotation("kubectl.kubernetes.io/default-container", name) - - def add_mars_worker_container( - self, name, image, cpu, mem, preemptive, port, scheduler_endpoint - ): - # compute n cpu, to avoid mars worker launches too many actors - if isinstance(cpu, str) and cpu[-1] == "m": - n_cpu = math.ceil(int("200m"[:-1]) / 1000) - if isinstance(cpu, (int, float)): - n_cpu = math.ceil(cpu) - else: - # by default: 1 - n_cpu = 1 - - cmd = [ - "while ! ls $VINEYARD_IPC_SOCKET 2>/dev/null; do sleep 1 && echo -n .; done", - ";", - 'echo \'"@inherits": "@mars/deploy/oscar/base_config.yml"\' > /tmp/mars-on-vineyard.yml', - ";", - 'echo "storage:" >> /tmp/mars-on-vineyard.yml', - ";", - 'echo " backends: [vineyard]" >> /tmp/mars-on-vineyard.yml', - ";", - 'echo " vineyard:" >> /tmp/mars-on-vineyard.yml', - ";", - 'echo " vineyard_socket: $VINEYARD_IPC_SOCKET" >> /tmp/mars-on-vineyard.yml', - ";", - "cat /tmp/mars-on-vineyard.yml", - ";", - "python3", - "-m", - "mars.deploy.oscar.worker", - "--n-cpu=%d" % n_cpu, - "--endpoint=$MY_POD_IP:%s" % port, - "--supervisors=%s" % scheduler_endpoint, - "--log-level=DEBUG", - "--config-file=/tmp/mars-on-vineyard.yml", - ] - cmd = ["bash", "-c", " ".join(cmd)] - - resources_dict = { - "requests": ResourceBuilder( - self._mars_worker_requests_cpu, self._mars_worker_requests_mem - ).build() - if preemptive - else ResourceBuilder(cpu, mem).build(), - "limits": ResourceBuilder(cpu, mem).build(), - } - - volumeMounts = [] - for vol in self._volumes: - for vol_mount in vol.build_mount(): - volumeMounts.append(vol_mount) - - probe = TcpProbeBuilder(port=port, timeout=15, period=10, failure_thresh=8) - - super().add_container( - _remove_nones( - { - "command": cmd, - "env": [env.build() for env in self._envs.values()] or None, - "image": image, - "name": name, - "imagePullPolicy": self._image_pull_policy, - "resources": dict((k, v) for k, v in resources_dict.items() if v) - or None, - "ports": [PortBuilder(port).build()], - "volumeMounts": 
volumeMounts or None, - "livenessProbe": None, - "readinessProbe": probe.build(), - } - ) + subject = kube_client.V1Subject( + kind="ServiceAccount", name=service_account_name, namespace=namespace ) - - def add_mars_scheduler_container( - self, name, image, cpu, mem, preemptive, port, web_port - ): - cmd = [ - "while ! ls $VINEYARD_IPC_SOCKET 2>/dev/null; do sleep 1 && echo -n .; done", - ";", - 'echo \'"@inherits": "@mars/deploy/oscar/base_config.yml"\' > /tmp/mars-on-vineyard.yml', - ";", - 'echo "storage:" >> /tmp/mars-on-vineyard.yml', - ";", - 'echo " backends: [vineyard]" >> /tmp/mars-on-vineyard.yml', - ";", - 'echo " vineyard:" >> /tmp/mars-on-vineyard.yml', - ";", - 'echo " vineyard_socket: $VINEYARD_IPC_SOCKET" >> /tmp/mars-on-vineyard.yml', - ";", - "cat /tmp/mars-on-vineyard.yml", - ";", - "python3", - "-m", - "mars.deploy.oscar.supervisor", - "--endpoint=$MY_POD_IP:%s" % port, - "--web-port=%s" % web_port, - "--log-level=DEBUG", - "--config-file=/tmp/mars-on-vineyard.yml", - ] - cmd = ["bash", "-c", " ".join(cmd)] - - resources_dict = { - "requests": ResourceBuilder( - self._mars_scheduler_requests_cpu, self._mars_scheduler_requests_mem - ).build() - if preemptive - else ResourceBuilder(cpu, mem).build(), - "limits": ResourceBuilder(cpu, mem).build(), - } - - volumeMounts = [] - for vol in self._volumes: - for vol_mount in vol.build_mount(): - volumeMounts.append(vol_mount) - - probe = TcpProbeBuilder(port=port, timeout=15, period=10, failure_thresh=8) - - super().add_container( - _remove_nones( - { - "command": cmd, - "env": [env.build() for env in self._envs.values()] or None, - "image": image, - "name": name, - "imagePullPolicy": self._image_pull_policy, - "resources": dict((k, v) for k, v in resources_dict.items() if v) - or None, - "ports": [PortBuilder(port).build(), PortBuilder(web_port).build()], - "volumeMounts": volumeMounts or None, - "livenessProbe": None, - "readinessProbe": probe.build(), - } - ) + role_binding = 
kube_client.V1ClusterRoleBinding( + metadata=metadata, role_ref=role_ref, subjects=[subject] ) - - super().add_annotation("kubectl.kubernetes.io/default-container", name) - - def add_engine_pod_node_selector(self, node_selector): - if node_selector: - super().add_pod_node_selector(node_selector) - - -class PodBuilder(object): - """Base builder for k8s pod.""" - - def __init__( - self, - name, - labels, - hostname=None, - subdomain=None, - restart_policy="Never", - node_selector=None, - ): - self._name = name - self._labels = labels - self._hostname = hostname - self._subdomain = subdomain - self._restart_policy = restart_policy - - self._containers = [] - self._image_pull_secrets = [] - self._volumes = [] - if node_selector: - self._node_selector = node_selector - else: - self._node_selector = dict() - - def add_volume(self, vol): - if isinstance(vol, list): - self._volumes.extend(vol) - else: - self._volumes.append(vol) - - def add_container(self, ctn): - self._containers.append(ctn) - - def add_image_pull_secret(self, name): - self._image_pull_secrets.append(LocalObjectRefBuilder(name)) - - def build_pod_spec(self): - return _remove_nones( - { - "hostname": self._hostname, - "subdomain": self._subdomain, - "containers": [ctn for ctn in self._containers], - "volumes": [vol.build() for vol in self._volumes] or None, - "imagePullSecrets": [ips.build() for ips in self._image_pull_secrets] - or None, - "restartPolicy": self._restart_policy, - "nodeSelector": self._node_selector or None, - } + return role_binding + + @staticmethod + def get_tcp_probe(port, timeout=15, period=10, failure_threshold=8): + return kube_client.V1Probe( + tcp_socket=kube_client.V1TCPSocketAction(port=port), + timeout_seconds=timeout, + period_seconds=period, + failure_threshold=failure_threshold, ) - def build(self): - return { - "kind": "Pod", - "metadata": {"name": self._name, "labels": self._labels}, - "spec": self.build_pod_spec(), - } - + @staticmethod + def get_exec_action(command): + 
return kube_client.V1ExecAction(command=command) -class GSEtcdBuilder(object): - """Builder for graphscope etcd.""" + @staticmethod + def get_lifecycle_handler(_exec=None, http_get=None, tcp_socket=None): + handler = kube_client.V1LifecycleHandler(_exec=_exec, http_get=http_get, tcp_socket=tcp_socket) + return handler - _requests_cpu = 0.2 - _requests_mem = "128Mi" - - def __init__( - self, - name_prefix, - container_name, - service_name, - image, - cpu, - mem, - preemptive, - listen_peer_service_port, - listen_client_service_port, - labels, - image_pull_policy, - num_pods=3, - restart_policy="Always", - image_pull_secrets=None, - max_txn_ops=1024000, - ): - self._name_prefix = name_prefix - self._container_name = container_name - self._service_name = service_name - self._image = image - self._cpu = cpu - self._mem = mem - self._preemptive = preemptive - self._listen_peer_service_port = listen_peer_service_port - self._listen_client_service_port = listen_client_service_port - self._labels = labels - self._image_pull_policy = image_pull_policy - self._num_pods = num_pods - self._restart_policy = restart_policy - self._image_pull_secrets = image_pull_secrets - self._max_txn_ops = 1024000 - self._node_selector = dict() + @staticmethod + def get_lifecycle(post_start=None, pre_stop=None): + return kube_client.V1Lifecycle(post_start=post_start, pre_stop=pre_stop) - self._envs = dict() - self._volumes = [] - - def add_volume(self, vol): - if isinstance(vol, list): - self._volumes.extend(vol) - else: - self._volumes.append(vol) - - def add_env(self, name, value=None): - self._envs[name] = ContainerEnvBuilder(name, value) - - def add_simple_envs(self, envs): - for k, v in envs.items() or (): - self.add_env(k, v) - - def build(self): + @staticmethod + def get_image_pull_secrets(image_pull_secrets): """ - Returns: a list of :class:`PodBuilder`. 
+ for name in self._image_pull_secrets: + engine_builder.add_image_pull_secret(name) """ - pods_name = [] - initial_cluster = "" - for i in range(self._num_pods): - name = "%s-%s" % (self._name_prefix, str(i)) - pods_name.append(name) - initial_cluster += "%s=http://%s:%s," % ( - name, - name, - self._listen_peer_service_port, - ) - # drop last comma - initial_cluster = initial_cluster[0:-1] + local_object_refs = [] + for name in image_pull_secrets: + local_object_refs.append(kube_client.V1LocalObjectReference(name=name)) + return local_object_refs - pods_builders, svc_builders = [], [] - for _, name in enumerate(pods_name): - pod_labels = {"etcd_name": name} - pod_builder = PodBuilder( - name=name, - labels={**self._labels, **pod_labels}, - hostname=name, - subdomain=self._service_name, - restart_policy=self._restart_policy, - node_selector=self._node_selector, - ) + @staticmethod + def get_node_selector(node_selector): + return node_selector - # volumes - pod_builder.add_volume(self._volumes) - - cmd = [ - "etcd", - "--name", - name, - "--max-txn-ops=%s" % self._max_txn_ops, - "--initial-advertise-peer-urls", - "http://%s:%s" % (name, self._listen_peer_service_port), - "--advertise-client-urls", - "http://%s:%s" % (name, self._listen_client_service_port), - "--data-dir=/var/lib/etcd", - "--listen-client-urls=http://0.0.0.0:%s" - % self._listen_client_service_port, - "--listen-peer-urls=http://0.0.0.0:%s" % self._listen_peer_service_port, - "--initial-cluster", - initial_cluster, - "--initial-cluster-state", - "new", - ] - - resources_dict = { - "requests": ResourceBuilder( - self._requests_cpu, self._requests_mem - ).build() - if self._preemptive - else ResourceBuilder(self._cpu, self._mem).build(), - "limits": ResourceBuilder(self._cpu, self._mem).build(), - } - - volumeMounts = [] - for vol in self._volumes: - for vol_mount in vol.build_mount(): - volumeMounts.append(vol_mount) - - pod_builder.add_container( - _remove_nones( - { - "command": cmd, - "env": 
[env.build() for env in self._envs.values()] or None, - "image": self._image, - "name": self._container_name, - "imagePullPolicy": self._image_pull_policy, - "resources": dict( - (k, v) for k, v in resources_dict.items() if v - ) - or None, - "ports": [ - PortBuilder(self._listen_peer_service_port).build(), - PortBuilder(self._listen_client_service_port).build(), - ], - "volumeMounts": volumeMounts or None, - "livenessProbe": self.build_liveness_probe().build(), - "readinessProbe": self.build_readiness_probe().build(), - "lifecycle": None, + @staticmethod + def get_user_defined_volumes(udf_volumes): + """ + { + name: { + "type": "", + "field": {}, # the keys are subject to volume type + "mounts": [ {"mountPath": "", "subPath": ""}, ... ] } - ) - ) - pods_builders.append(pod_builder) - - service_builder = ServiceBuilder( - name, - service_type="ClusterIP", - port=[ - self._listen_peer_service_port, - self._listen_client_service_port, - ], - selector=pod_labels, - ) - svc_builders.append(service_builder) - - return pods_builders, svc_builders - - def build_liveness_probe(self): - liveness_cmd = [ - "/bin/sh", - "-ec", - "ETCDCTL_API=3 etcdctl --endpoints=http://[127.0.0.1]:%s get foo" - % str(self._listen_client_service_port), - ] - return ExecProbeBuilder(liveness_cmd, timeout=15, period=10, failure_thresh=8) - - def build_readiness_probe(self): - return TcpProbeBuilder( - self._listen_peer_service_port, timeout=15, period=10, failure_thresh=8 - ) - - def add_etcd_pod_node_selector(self, node_selector): - if node_selector: - for k, v in node_selector.items(): - self._node_selector[k] = v - - -class GSCoordinatorBuilder(DeploymentBuilder): - """Builder for graphscope coordinator.""" - - _requests_cpu = 0.5 - _requests_mem = "512Mi" - - def __init__(self, name, labels, image_pull_policy, replicas=1): + } + """ + if not udf_volumes: + return [], [], [] + volumes, source_volume_mounts = [], [] + for name, value in udf_volumes.items(): + volume = 
kube_client.V1Volume(name=name) + field = value.get("field", {}) + if value['type'] == 'hostPath': + volume.host_path = kube_client.V1HostPathVolumeSource(path=field['path']) + if 'type' in field: + volume.host_path.type = field['type'] + elif value['type'] == 'emptyDir': + volume.empty_dir = kube_client.V1EmptyDirVolumeSource() + if 'medium' in field: + volume.empty_dir.medium = field['medium'] + if 'sizeLimit' in field: + volume.empty_dir.size_limit = field['sizeLimit'] + elif value['type'] == 'persistentVolumeClaim': + pvc = kube_client.V1PersistentVolumeClaimVolumeSource(claim_name=field['claimName']) + volume.persistent_volume_claim = pvc + if 'readOnly' in field: + volume.persistent_volume_claim.read_only = field['readOnly'] + elif value['type'] == 'configMap': + volume.config_map = kube_client.V1ConfigMapVolumeSource(name=field['name']) + elif value['type'] == 'secret': + volume.secret = kube_client.V1SecretVolumeSource(secret_name=field['name']) + else: + raise ValueError(f"Unsupported volume type: {value['type']}") + volume_mounts = [] + mounts_list = value['mounts'] + if not isinstance(mounts_list, list): + mounts_list = [value['mounts']] + for udf_mount in mounts_list: + volume_mount = kube_client.V1VolumeMount(name=name, mount_path=udf_mount['mountPath']) + if 'subPath' in udf_mount: + volume_mount.sub_path = udf_mount['subPath'] + if 'readOnly' in udf_mount: + volume_mount.read_only = udf_mount['readOnly'] + volume_mounts.append(volume_mount) + volumes.append(volume) + source_volume_mounts.extend(volume_mounts) + # Assume destination mounts are the same as source mounts + destination_volume_mounts = source_volume_mounts + return volumes, source_volume_mounts, destination_volume_mounts + + @staticmethod + def get_resources(requests, limits, preemptive=True): + resource_requirements = kube_client.V1ResourceRequirements() + if not preemptive and requests is not None: + resource_requirements.requests = requests + if limits is not None: + 
resource_requirements.limits = limits + return resource_requirements + + @staticmethod + def get_pod_spec(containers: [kube_client.V1Container], image_pull_secrets=None, node_selector=None, volumes=None): + pod_spec = kube_client.V1PodSpec(containers=containers) + if image_pull_secrets is not None and image_pull_secrets: + pod_spec.image_pull_secrets = ResourceBuilder.get_image_pull_secrets(image_pull_secrets) + if node_selector is not None and node_selector: + pod_spec.node_selector = ResourceBuilder.get_node_selector(node_selector) + if volumes is not None and volumes: + pod_spec.volumes = volumes + return pod_spec + + @staticmethod + def get_pod_template_spec(spec: kube_client.V1PodSpec, labels: dict): + pod_template_spec = kube_client.V1PodTemplateSpec() + pod_template_spec.spec = spec + pod_template_spec.metadata = kube_client.V1ObjectMeta(labels=labels) + return pod_template_spec + + @staticmethod + def get_deployment_spec(template, replicas, labels): + selector = kube_client.V1LabelSelector(match_labels=labels) + spec = kube_client.V1DeploymentSpec(selector=selector, template=template) + spec.replicas = replicas + return spec + + @staticmethod + def get_deployment(namespace, name, spec, labels): + deployment = kube_client.V1Deployment() + deployment.api_version = "apps/v1" + deployment.kind = "Deployment" + deployment.metadata = kube_client.V1ObjectMeta(name=name, labels=labels, namespace=namespace) + deployment.spec = spec + return deployment + + @staticmethod + def get_stateful_set_spec(template, replicas, labels, service_name): + selector = kube_client.V1LabelSelector(match_labels=labels) + spec = kube_client.V1StatefulSetSpec(selector=selector, template=template, service_name=service_name) + spec.replicas = replicas + return spec + + @staticmethod + def get_stateful_set(namespace, name, spec, labels): + statefulset = kube_client.V1StatefulSet() + statefulset.api_version = "apps/v1" + statefulset.kind = "StatefulSet" + statefulset.metadata = 
kube_client.V1ObjectMeta(name=name, labels=labels, namespace=namespace) + statefulset.spec = spec + return statefulset + + @staticmethod + def get_value_from_field_ref(name, field_path): + env = kube_client.V1EnvVar(name=name) + value_from = kube_client.V1EnvVarSource() + value_from.field_ref = kube_client.V1ObjectFieldSelector(field_path=field_path) + env.value_from = value_from + return env + + @staticmethod + def get_namespace(name): + namespace = kube_client.V1Namespace() + namespace.metadata = kube_client.V1ObjectMeta(name=name) + namespace.metadata.labels = {"kubernetes.io/metadata.name": name} + return namespace + + @staticmethod + def get_service_spec(type, ports, labels, external_traffic_policy): + service_spec = kube_client.V1ServiceSpec() + service_spec.type = type + service_spec.selector = labels + service_spec.ports = ports + if external_traffic_policy is not None: + service_spec.external_traffic_policy = external_traffic_policy + return service_spec + + @staticmethod + def get_service(namespace, name, service_spec, labels, annotations=None): + service = kube_client.V1Service() + service.api_version = "v1" + service.kind = "Service" + service.spec = service_spec + metadata = kube_client.V1ObjectMeta(namespace=namespace, name=name, labels=labels, annotations=annotations) + service.metadata = metadata + return service + + +class CoordinatorDeployment: + def __init__(self, namespace, name, image, args, labels, image_pull_secret, + image_pull_policy, node_selector, env, host_network, port=None): + self._replicas = 1 + self._namespace = namespace self._name = name + self._image = image + self._args = args self._labels = labels - self._replicas = replicas self._image_pull_policy = image_pull_policy - super().__init__( - self._name, self._labels, self._replicas, self._image_pull_policy - ) - - def add_coordinator_container(self, name, image, cpu, mem, preemptive, **kwargs): - cmd = kwargs.pop("cmd", None) - args = kwargs.pop("args", None) - module_name = 
kwargs.pop("module_name", "gscoordinator") - - resources_dict = { - "requests": ResourceBuilder(self._requests_cpu, self._requests_mem).build() - if preemptive - else ResourceBuilder(cpu, mem).build(), - "limits": ResourceBuilder(cpu, mem).build(), - } - - volumeMounts = [] - for vol in self._volumes: - for vol_mount in vol.build_mount(): - volumeMounts.append(vol_mount) - - pre_stop_command = [ - sys.executable, - "-m", - "{0}.hook.prestop".format(module_name), - ] - lifecycle_dict = _remove_nones( - { - "preStop": { - "exec": {"command": pre_stop_command}, - } - } - ) - - ports = kwargs.pop("ports", None) - if ports is not None and not isinstance(ports, list): - ports = [ports] - - super().add_container( - _remove_nones( - { - "command": cmd, - "args": args, - "env": [env.build() for env in self._envs.values()] or None, - "image": image, - "name": name, - "imagePullPolicy": self._image_pull_policy, - "resources": dict((k, v) for k, v in resources_dict.items() if v) - or None, - "ports": [PortBuilder(port).build() for port in ports] - if ports - else None, - "volumeMounts": volumeMounts or None, - "livenessProbe": None, - "readinessProbe": self.build_readiness_probe(ports[0]).build(), - "lifecycle": lifecycle_dict, - } - ) + self._image_pull_secret = image_pull_secret + self._env: dict = env + self._port = port + self._host_network = host_network + self._node_selector = node_selector + self._requests = {"cpu": 0.5, "memory": "512Mi"} + self._limits = {"cpu": 0.5, "memory": "512Mi"} + + def get_lifecycle(self): + pre_stop = ["/opt/rh/rh-python38/root/usr/bin/python3", "-m", "gscoordinator.hook.prestop"] + _exec = ResourceBuilder.get_exec_action(pre_stop) + lifecycle_handler = ResourceBuilder.get_lifecycle_handler(_exec) + lifecycle = ResourceBuilder.get_lifecycle(pre_stop=lifecycle_handler) + return lifecycle + + def get_coordinator_container(self): + resources = ResourceBuilder.get_resources(self._requests, self._limits) + lifecycle = self.get_lifecycle() + env = 
[kube_client.V1EnvVar(name=key, value=value) for key, value in self._env.items()] + container = kube_client.V1Container( + name="coordinator", + image=self._image, + image_pull_policy=self._image_pull_policy, + args=self._args, + resources=resources, + lifecycle=lifecycle, + env=env, ) - def add_coordinator_pod_node_selector(self, node_selector): - if node_selector: - super().add_pod_node_selector(node_selector) - - def build_readiness_probe(self, port): - return TcpProbeBuilder(port=port, timeout=15, period=10, failure_thresh=8) + if self._port is not None: + container_ports = [kube_client.V1ContainerPort(container_port=self._port)] + container_ports.append(kube_client.V1ContainerPort(container_port=8000)) + container.ports = container_ports + container.readiness_probe = ResourceBuilder.get_tcp_probe(port=self._port, + timeout=15, + period=1, + failure_threshold=20) + return container + + def get_coordinator_pod_spec(self): + container = self.get_coordinator_container() + pod_spec = ResourceBuilder.get_pod_spec(containers=[container], + image_pull_secrets=self._image_pull_secret, + node_selector=self._node_selector) + pod_spec.host_network = self._host_network + return pod_spec + + def get_coordinator_pod_template_spec(self): + spec = self.get_coordinator_pod_spec() + return ResourceBuilder.get_pod_template_spec(spec, self._labels) + + def get_coordinator_deployment_spec(self, replicas): + template = self.get_coordinator_pod_template_spec() + spec = ResourceBuilder.get_deployment_spec(template, replicas, self._labels) + return spec + + def get_coordinator_deployment(self): + spec = self.get_coordinator_deployment_spec(self._replicas) + return ResourceBuilder.get_deployment(self._namespace, self._name, spec, self._labels) + + def get_coordinator_service(self, service_type, port): + ports = [kube_client.V1ServicePort(name="coordinator", port=port)] + ports.append(kube_client.V1ServicePort(name="debug", port=8000)) + service_spec = 
ResourceBuilder.get_service_spec(service_type, ports, self._labels, None) + service = ResourceBuilder.get_service(self._namespace, self._name, service_spec, self._labels) + return service diff --git a/python/graphscope/deploy/kubernetes/utils.py b/python/graphscope/deploy/kubernetes/utils.py index 1ca62611d045..17ab1dee8aaf 100644 --- a/python/graphscope/deploy/kubernetes/utils.py +++ b/python/graphscope/deploy/kubernetes/utils.py @@ -104,7 +104,7 @@ def wait_for_deployment_complete(api_client, namespace, name, timeout_seconds=60 app_api = kube_client.AppsV1Api(api_client) start_time = time.time() while time.time() - start_time < timeout_seconds: - time.sleep(2) + time.sleep(1) response = app_api.read_namespaced_deployment_status( namespace=namespace, name=name ) @@ -117,10 +117,8 @@ def wait_for_deployment_complete(api_client, namespace, name, timeout_seconds=60 ): return True # check failed - selector = "" - for k, v in response.spec.selector.match_labels.items(): - selector += k + "=" + v + "," - selector = selector[:-1] + match_labels = response.spec.selector.match_labels + selector = ",".join([f"{k}={v}" for k, v in match_labels.items()]) pods = core_api.list_namespaced_pod( namespace=namespace, label_selector=selector ) @@ -249,7 +247,7 @@ def get_service_endpoints( # noqa: C901 service_type: str Service type. Valid options are NodePort, LoadBalancer and ClusterIP. timeout_seconds: int - Raise TimeoutError after waiting for this seconds, only used in LoadBalancer type. + Raise TimeoutError after the duration, only used in LoadBalancer type. 
Raises: TimeoutError: If the underlying cloud-provider doesn't support the LoadBalancer @@ -268,10 +266,7 @@ def get_service_endpoints( # noqa: C901 svc = core_api.read_namespaced_service(name=name, namespace=namespace) # get pods - selector = "" - for k, v in svc.spec.selector.items(): - selector += k + "=" + v + "," - selector = selector[:-1] + selector = ",".join([f"{k}={v}" for k, v in svc.spec.selector.items()]) pods = core_api.list_namespaced_pod(namespace=namespace, label_selector=selector) ips = [] @@ -285,19 +280,21 @@ def get_service_endpoints( # noqa: C901 elif service_type == "LoadBalancer": while True: svc = core_api.read_namespaced_service(name=name, namespace=namespace) - if svc.status.load_balancer.ingress is not None: - for ingress in svc.status.load_balancer.ingress: - if ingress.hostname is not None: - ips.append(ingress.hostname) - else: - ips.append(ingress.ip) - for port in svc.spec.ports: - if query_port is None or port.port == query_port: - ports.append(port.port) - break - time.sleep(1) - if time.time() - start_time > timeout_seconds: - raise TimeoutError("LoadBalancer service type is not supported yet.") + if svc.status.load_balancer.ingress is None: + if time.time() - start_time > timeout_seconds: + raise TimeoutError( + "LoadBalancer service type is not supported yet." 
+ ) + time.sleep(1) + continue + for ingress in svc.status.load_balancer.ingress: + if ingress.hostname is not None: + ips.append(ingress.hostname) + else: + ips.append(ingress.ip) + for port in svc.spec.ports: + if query_port is None or port.port == query_port: + ports.append(port.port) elif service_type == "ClusterIP": ips.append(svc.spec.cluster_ip) for port in svc.spec.ports: @@ -306,15 +303,10 @@ def get_service_endpoints( # noqa: C901 else: raise K8sError("Service type {0} is not supported yet".format(service_type)) - # generate endpoint - endpoints = [] - if not ips or not ports: - raise K8sError("Get {0} service {1} failed.".format(service_type, name)) + raise K8sError(f"Get {service_type} service {name} failed.") - for ip in ips: - for port in ports: - endpoints.append("{0}:{1}".format(ip, port)) + endpoints = [f"{ip}:{port}" for ip in ips for port in ports] return endpoints @@ -366,7 +358,7 @@ def delete_kubernetes_object( # convert group name from DNS subdomain format to # python class name convention group = "".join(word.capitalize() for word in group.split(".")) - fcn_to_call = "{0}{1}Api".format(group, version.capitalize()) + fcn_to_call = f"{group}{version.capitalize()}Api" k8s_api = getattr(kube_client, fcn_to_call)( api_client ) # pylint: disable=not-callable diff --git a/python/graphscope/deploy/launcher.py b/python/graphscope/deploy/launcher.py index bcc1d99007b6..afb4822d4dba 100644 --- a/python/graphscope/deploy/launcher.py +++ b/python/graphscope/deploy/launcher.py @@ -35,7 +35,7 @@ def __init__(self): def coordinator_endpoint(self): if self._coordinator_endpoint is None: raise RuntimeError("Get None value of coordinator endpoint.") - return str(self._coordinator_endpoint) + return self._coordinator_endpoint @abstractmethod def type(self): diff --git a/python/graphscope/framework/utils.py b/python/graphscope/framework/utils.py index 8f99615cf8cf..40b1afc0ac01 100644 --- a/python/graphscope/framework/utils.py +++ 
b/python/graphscope/framework/utils.py @@ -218,7 +218,7 @@ def get_platform_info(): def _get_gcc_version(): gcc = shutil.which("gcc") if gcc is None: - raise RuntimeError("gcc command not found.") + return None return subprocess.check_output([gcc, "--version"], stderr=subprocess.STDOUT) platform_info = ( diff --git a/python/graphscope/nx/conftest.py b/python/graphscope/nx/conftest.py index f7439fa26e38..017360af8ec9 100644 --- a/python/graphscope/nx/conftest.py +++ b/python/graphscope/nx/conftest.py @@ -26,6 +26,7 @@ @pytest.fixture(scope="module") def graphscope_session(): graphscope.set_option(show_log=True) + graphscope.set_option(log_level="DEBUG") if os.environ.get("DEPLOYMENT", None) == "standalone": sess = graphscope.session(cluster_type="hosts", num_workers=1) else: diff --git a/python/graphscope/tests/conftest.py b/python/graphscope/tests/conftest.py index 81aa09a7d6cf..e9a07d1761ef 100644 --- a/python/graphscope/tests/conftest.py +++ b/python/graphscope/tests/conftest.py @@ -34,6 +34,7 @@ @pytest.fixture(scope="module") def graphscope_session(): graphscope.set_option(show_log=True) + graphscope.set_option(log_level="DEBUG") if os.environ.get("DEPLOYMENT", None) == "standalone": sess = graphscope.session(cluster_type="hosts", num_workers=1) else: diff --git a/python/graphscope/tests/kubernetes/test_demo_script.py b/python/graphscope/tests/kubernetes/test_demo_script.py index 34f7356680a7..c9b285949d96 100644 --- a/python/graphscope/tests/kubernetes/test_demo_script.py +++ b/python/graphscope/tests/kubernetes/test_demo_script.py @@ -48,27 +48,30 @@ def get_k8s_volumes(): return k8s_volumes -def get_gs_image_on_ci_env(): - if "GS_IMAGE" in os.environ: - return os.environ["GS_IMAGE"] - return gs_config.k8s_gs_image +def get_gs_registry_on_ci_env(): + if "GS_REGISTRY" in os.environ: + return os.environ["GS_REGISTRY"] + return gs_config.k8s_image_registry + + +def get_gs_tag_on_ci_env(): + if "GS_TAG" in os.environ: + return os.environ["GS_TAG"] + return 
gs_config.k8s_image_tag @pytest.fixture def gs_session(): - gs_image = get_gs_image_on_ci_env() sess = graphscope.session( num_workers=1, - k8s_gs_image=gs_image, + k8s_image_registry=get_gs_registry_on_ci_env(), + k8s_image_tag=get_gs_tag_on_ci_env(), k8s_coordinator_cpu=2, k8s_coordinator_mem="4Gi", k8s_vineyard_cpu=2, k8s_vineyard_mem="512Mi", k8s_engine_cpu=2, k8s_engine_mem="4Gi", - k8s_etcd_cpu=2, - k8s_etcd_mem="256Mi", - k8s_etcd_num_pods=2, vineyard_shared_mem="4Gi", k8s_volumes=get_k8s_volumes(), ) @@ -78,19 +81,16 @@ def gs_session(): @pytest.fixture def gs_session_distributed(): - gs_image = get_gs_image_on_ci_env() sess = graphscope.session( num_workers=2, - k8s_gs_image=gs_image, + k8s_image_registry=get_gs_registry_on_ci_env(), + k8s_image_tag=get_gs_tag_on_ci_env(), k8s_coordinator_cpu=2, k8s_coordinator_mem="4Gi", k8s_vineyard_cpu=2, k8s_vineyard_mem="1Gi", k8s_engine_cpu=2, k8s_engine_mem="4Gi", - k8s_etcd_cpu=1, - k8s_etcd_mem="256Mi", - k8s_etcd_num_pods=2, vineyard_shared_mem="4Gi", k8s_volumes=get_k8s_volumes(), ) @@ -223,10 +223,10 @@ def test_multiple_session(): [random.choice(string.ascii_lowercase) for _ in range(6)] ) - gs_image = get_gs_image_on_ci_env() sess = graphscope.session( num_workers=1, - k8s_gs_image=gs_image, + k8s_image_registry=get_gs_registry_on_ci_env(), + k8s_image_tag=get_gs_tag_on_ci_env(), k8s_volumes=get_k8s_volumes(), ) info = sess.info @@ -236,7 +236,8 @@ def test_multiple_session(): sess2 = graphscope.session( k8s_namespace=namespace, num_workers=2, - k8s_gs_image=gs_image, + k8s_image_registry=get_gs_registry_on_ci_env(), + k8s_image_tag=get_gs_tag_on_ci_env(), k8s_volumes=get_k8s_volumes(), ) diff --git a/python/graphscope/tests/kubernetes/test_resource_builder.py b/python/graphscope/tests/kubernetes/test_resource_builder.py deleted file mode 100644 index cdf59dffd901..000000000000 --- a/python/graphscope/tests/kubernetes/test_resource_builder.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -# -*- 
coding: utf-8 -*- -# -# Copyright 2020 Alibaba Group Holding Limited. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from graphscope.deploy.kubernetes.resource_builder import ReplicaSetBuilder - - -def test_replica_set_builder(): - labels = { - "app.kubernetes.io/name": "graphscope", - "app.kubernetes.io/component": "engine", - } - engine_builder = ReplicaSetBuilder( - name="engine", - labels=labels, - replicas=2, - image_pull_policy=None, - ) - - result = engine_builder.build() - - assert result["spec"]["template"]["metadata"]["annotations"] == {} - - name = "kubectl.kubernetes.io/default-container" - engine_builder.add_annotation(name, "engine") - result = engine_builder.build() - - assert result["spec"]["template"]["metadata"]["annotations"][name] == "engine" diff --git a/python/graphscope/tests/kubernetes/test_with_mars.py b/python/graphscope/tests/kubernetes/test_with_mars.py index a791a71622c4..2c1440e72d6c 100644 --- a/python/graphscope/tests/kubernetes/test_with_mars.py +++ b/python/graphscope/tests/kubernetes/test_with_mars.py @@ -39,18 +39,24 @@ def get_k8s_volumes(): return k8s_volumes -def get_gs_image_on_ci_env(): - if "GS_IMAGE" in os.environ: - return os.environ["GS_IMAGE"] - return gs_config.k8s_gs_image +def get_gs_registry_on_ci_env(): + if "GS_REGISTRY" in os.environ: + return os.environ["GS_REGISTRY"] + return gs_config.k8s_image_registry + + +def get_gs_tag_on_ci_env(): + if "GS_TAG" in os.environ: + return 
os.environ["GS_TAG"] + return gs_config.k8s_image_tag @pytest.fixture def gs_session(): - gs_image = get_gs_image_on_ci_env() sess = graphscope.session( num_workers=2, - k8s_gs_image=gs_image, + k8s_image_registry=get_gs_registry_on_ci_env(), + k8s_image_tag=get_gs_tag_on_ci_env(), k8s_coordinator_cpu=2, k8s_coordinator_mem="4Gi", k8s_vineyard_cpu=2, diff --git a/python/graphscope/tests/unittest/test_lazy.py b/python/graphscope/tests/unittest/test_lazy.py index 6d1c1082b487..31e186d0f7b9 100644 --- a/python/graphscope/tests/unittest/test_lazy.py +++ b/python/graphscope/tests/unittest/test_lazy.py @@ -24,6 +24,7 @@ import graphscope graphscope.set_option(show_log=True) +graphscope.set_option(log_level="DEBUG") from graphscope.dataset import load_p2p_network from graphscope.framework.loader import Loader diff --git a/python/graphscope/tests/unittest/test_scalability.py b/python/graphscope/tests/unittest/test_scalability.py index f604223f8bf5..23adee820fbc 100644 --- a/python/graphscope/tests/unittest/test_scalability.py +++ b/python/graphscope/tests/unittest/test_scalability.py @@ -29,6 +29,7 @@ def p2p_property_graph(num_workers, directed=True): data_dir = os.path.expandvars("${GS_TEST_DIR}/property") graphscope.set_option(show_log=True) + graphscope.set_option(log_level="DEBUG") sess = graphscope.session(num_workers=num_workers, cluster_type="hosts") graph = sess.g(directed=directed) graph = graph.add_vertices("{}/p2p-31_property_v_0".format(data_dir), "person") diff --git a/python/graphscope/tests/unittest/test_session.py b/python/graphscope/tests/unittest/test_session.py index dcbafc469a03..639bc38ef454 100644 --- a/python/graphscope/tests/unittest/test_session.py +++ b/python/graphscope/tests/unittest/test_session.py @@ -31,6 +31,7 @@ def setUpModule(): graphscope.set_option(show_log=True) + graphscope.set_option(log_level="DEBUG") @pytest.fixture diff --git a/tutorials/10_revisit_classification_on_citation_network_on_k8s.ipynb 
b/tutorials/10_revisit_classification_on_citation_network_on_k8s.ipynb index b821605c9891..c93707633f7e 100644 --- a/tutorials/10_revisit_classification_on_citation_network_on_k8s.ipynb +++ b/tutorials/10_revisit_classification_on_citation_network_on_k8s.ipynb @@ -47,11 +47,11 @@ "outputs": [], "source": [ "# Create a session on kubernetes cluster and\n", - "# mount dataset bucket to path \"/home/jovyan/datasets\" in pod.\n", + "# mount dataset bucket to path \"/datasets\" in pod.\n", "\n", "from graphscope.dataset import load_ogbn_mag\n", "\n", - "sess = graphscope.session(mount_dataset=\"/home/jovyan/datasets\")" + "sess = graphscope.session(with_dataset=True)" ] }, { @@ -91,7 +91,7 @@ "source": [ "# Load the obgn_mag dataset in \"sess\" as a graph\n", "\n", - "graph = load_ogbn_mag(sess, \"/home/jovyan/datasets/ogbn_mag_small/\")" + "graph = load_ogbn_mag(sess, \"/datasets/ogbn_mag_small/\")" ] }, { @@ -283,7 +283,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.8.5 (default, Jul 28 2020, 12:59:40) \n[GCC 9.3.0]" + }, + "vscode": { + "interpreter": { + "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" + } } }, "nbformat": 4, diff --git a/tutorials/zh/11_revisit_classification_on_citation_network_on_k8s.ipynb b/tutorials/zh/11_revisit_classification_on_citation_network_on_k8s.ipynb index 462274a9cc3c..73131904cf3b 100644 --- a/tutorials/zh/11_revisit_classification_on_citation_network_on_k8s.ipynb +++ b/tutorials/zh/11_revisit_classification_on_citation_network_on_k8s.ipynb @@ -48,11 +48,11 @@ "outputs": [], "source": [ "# Create a session on kubernetes cluster and\n", - "# mount dataset bucket to path \"/home/jovyan/datasets\" in pod.\n", + "# mount dataset bucket to path \"/datasets\" in pod.\n", "\n", "from graphscope.dataset import load_ogbn_mag\n", "\n", - "sess = graphscope.session(mount_dataset=\"/home/jovyan/datasets\")" + "sess = 
graphscope.session(with_dataset=True)" ] }, { @@ -92,7 +92,7 @@ "source": [ "# Load the obgn_mag dataset in \"sess\" as a graph\n", "\n", - "graph = load_ogbn_mag(sess, \"/home/jovyan/datasets/ogbn_mag_small/\")" + "graph = load_ogbn_mag(sess, \"/datasets/ogbn_mag_small/\")" ] }, {