# .github/workflows/system-tests-enterprise.yml

# Copyright 2023 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
name: System Tests Enterprise

on:
  push:
    branches:
      # GitHub branch filters are glob patterns, not regular expressions: `.` is a literal dot and `+` means
      # "one or more of the preceding character", so the former '.+-system-tests' only matched names such as
      # '.-system-tests'. `**` matches any characters (including `/`), i.e. every branch ending in -system-tests.
      - '**-system-tests'

  schedule:

    # * is a special character in YAML, so the cron string has to be quoted
    # Run the system tests every 8 hours (cron hours should divide 24 equally, otherwise there will be an overlap at the end of the day)
    - cron: '0 */8 * * *'

  workflow_dispatch:
    inputs:
      docker_registry:
        description: 'Docker registry to pull images from (default: ghcr.io/, use registry.hub.docker.com/ for docker hub)'
        required: true
        default: 'ghcr.io/'
      clean_resources_in_teardown:
        description: 'Clean resources created by test (like project) in each test teardown (default: true - perform clean)'
        required: true
        default: 'true'
        type: choice
        options:
          - 'true'
          - 'false'
      override_iguazio_version:
        description: 'Override the configured target system iguazio version (leave empty to resolve automatically)'
        required: false

# Every run of this workflow joins the same concurrency group, so runs do not execute in parallel
concurrency: one-at-a-time

jobs:
  system-test-cleanup:
    name: System Test Cleanup
    runs-on: [ self-hosted, Linux ]
    container:
      image: ubuntu:latest
    timeout-minutes: 10
    # let's not run this on every fork, change to your fork when developing
    if: github.repository == 'mlrun/mlrun' || github.event_name == 'workflow_dispatch'

    steps:
    - uses: actions/checkout@v3
    - name: Install dependencies
      run: |
        apt-get update -qqy && apt-get install -y sshpass
    - name: cleanup docker images from registries
      # SSH to the data node and restart the docker-registry deployment (default-tenant namespace) in order to drop
      # the docker images created by the system tests, then upload the cleanup helper scripts to /home/iguazio.
      # NOTE: the scripts are only copied by this step, they are not executed here.
      run: |
        sshpass \
          -p "${{ secrets.LATEST_SYSTEM_TEST_DATA_CLUSTER_SSH_PASSWORD }}" \
          ssh \
          -o StrictHostKeyChecking=no \
          -o ServerAliveInterval=180 \
          -o ServerAliveCountMax=3 \
          ${{ secrets.LATEST_SYSTEM_TEST_DATA_CLUSTER_SSH_USERNAME }}@${{ secrets.LATEST_SYSTEM_TEST_DATA_CLUSTER_IP }} \
          kubectl -n default-tenant rollout restart deployment docker-registry

        # pass the host key option explicitly (as the ssh call above does) so the copies do not depend on the
        # ssh call having already recorded the host key
        sshpass \
          -p "${{ secrets.LATEST_SYSTEM_TEST_DATA_CLUSTER_SSH_PASSWORD }}" \
          scp \
          -o StrictHostKeyChecking=no \
          automation/system_test/cleanup.py \
          ${{ secrets.LATEST_SYSTEM_TEST_DATA_CLUSTER_SSH_USERNAME }}@${{ secrets.LATEST_SYSTEM_TEST_DATA_CLUSTER_IP }}:/home/iguazio/cleanup.py

        sshpass \
          -p "${{ secrets.LATEST_SYSTEM_TEST_DATA_CLUSTER_SSH_PASSWORD }}" \
          scp \
          -o StrictHostKeyChecking=no \
          automation/system_test/dev_utilities.py \
          ${{ secrets.LATEST_SYSTEM_TEST_DATA_CLUSTER_SSH_USERNAME }}@${{ secrets.LATEST_SYSTEM_TEST_DATA_CLUSTER_IP }}:/home/iguazio/dev_utilities.py

  prepare-system-tests-enterprise-ci:
    # When increasing the timeout make sure it's not larger than the schedule cron interval
    timeout-minutes: 55
    name: Prepare System Tests Enterprise
    runs-on: [ self-hosted, Linux ]
    container:
      image: python:3.9
    needs: [system-test-cleanup]
    # let's not run this on every fork, change to your fork when developing
    if: github.repository == 'mlrun/mlrun' || github.event_name == 'workflow_dispatch'

    steps:
    - uses: actions/checkout@v3
    - name: Install dependencies
      run: |
        apt-get update -qqy && apt-get install -y sshpass jq curl gnupg nodejs
    - uses: actions/setup-node@v4
      with:
        node-version: 18
    - name: Copy state branch file from remote
      # the rotating list of branches to test is kept on the data node, fetch it into the workspace
      run: |
        sshpass \
          -p "${{ secrets.LATEST_SYSTEM_TEST_DATA_CLUSTER_SSH_PASSWORD }}" \
          scp \
          -o StrictHostKeyChecking=no \
          ${{ secrets.LATEST_SYSTEM_TEST_DATA_CLUSTER_SSH_USERNAME }}@${{ secrets.LATEST_SYSTEM_TEST_DATA_CLUSTER_IP }}:/tmp/system-tests-branches-list.txt \
          system-tests-branches-list.txt

    - name: Resolve Branch To Run System Tests
      id: current-branch
      shell: bash
      # we store a file named /tmp/system-tests-branches-list.txt which contains a list of branches to run system tests
      # on. The branches are separated with commas, so each run we pop the first branch in the list and append it to
      # the end of the list.
      # This mechanism allows us to run on multiple branches without the need to modify the file or secrets each time
      # a new branch is added or removed
      run: |
        # Read branches from local file
        branches=$(cat system-tests-branches-list.txt)
        echo "branches found in system-tests-branches-list.txt: $branches"

        # Split branches into an array
        IFS=',' read -ra branches_array <<< "$branches"

        # Get the first branch in the list to work on
        first_branch="${branches_array[0]}"
        echo "working on $first_branch"

        # Remove the first branch from the list
        branches_array=("${branches_array[@]:1}")

        # Add the first branch at the end of the list
        branches_array+=("$first_branch")

        # Join branches back into a string (printf prefixes every item with a comma, so drop the leading one)
        branches=$(printf ",%s" "${branches_array[@]}")
        branches=${branches:1}

        # Output the new list of branches
        echo "$branches"

        # Write new branches order to a local file
        echo "$branches" > system-tests-branches-list.txt

        # Set output
        echo "name=$(echo $first_branch)" >> $GITHUB_OUTPUT

    - name: Override remote file from local resolved branch list
      run: |
        # Override the remote file with the new list of branches
        sshpass \
          -p "${{ secrets.LATEST_SYSTEM_TEST_DATA_CLUSTER_SSH_PASSWORD }}" \
          scp \
          -o StrictHostKeyChecking=no system-tests-branches-list.txt \
          ${{ secrets.LATEST_SYSTEM_TEST_DATA_CLUSTER_SSH_USERNAME }}@${{ secrets.LATEST_SYSTEM_TEST_DATA_CLUSTER_IP }}:/tmp/

    # checking out to base branch and not the target(resolved) branch, to be able to run the changed preparation code
    # before merging the changes to upstream.
    - name: Checkout base branch
      uses: actions/checkout@v3

    - name: Install automation scripts dependencies
      run: |
        python -m pip install -r automation/requirements.txt

    - name: Extract git hashes from upstream and latest version
      id: git_upstream_info
      run: |

        # Get the latest commit of mlrun/mlrun (that is older than 1 hour)
        echo "mlrun_hash=$( \
          cd /tmp && \
          git clone --single-branch --branch ${{ steps.current-branch.outputs.name }} https://github.com/mlrun/mlrun.git mlrun-upstream 2> /dev/null && \
          cd mlrun-upstream && \
          git rev-list --until="1 hour ago" --max-count 1 --abbrev-commit HEAD && \
          cd .. && \
          rm -rf mlrun-upstream)" >> $GITHUB_OUTPUT

        # Get the latest commit of mlrun/ui (that is older than 1 hour)
        echo "ui_hash=$( \
          cd /tmp && \
          git clone --single-branch --branch ${{ steps.current-branch.outputs.name }} https://github.com/mlrun/ui.git mlrun-ui 2> /dev/null && \
          cd mlrun-ui && \
          git rev-list --until="1 hour ago" --max-count 1 --abbrev-commit HEAD && \
          cd .. && \
          rm -rf mlrun-ui)" >> $GITHUB_OUTPUT

        # Get the tested mlrun version
        echo "unstable_version_prefix=$( \
          curl https://raw.githubusercontent.com/mlrun/mlrun/${{ steps.current-branch.outputs.name }}/automation/version/unstable_version_prefix \
        )" >> $GITHUB_OUTPUT

    - name: Set computed versions params
      id: computed_params
      run: |
        # The commit hashes come solely from the git_upstream_info step; this job defines no other step that
        # produces them, so no fallback source is consulted.
        mlrun_hash="${{ steps.git_upstream_info.outputs.mlrun_hash }}"
        ui_hash="${{ steps.git_upstream_info.outputs.ui_hash }}"
        echo "mlrun_hash=$mlrun_hash" >> $GITHUB_OUTPUT
        echo "ui_hash=$ui_hash" >> $GITHUB_OUTPUT
        # version uses '+' as the separator while the docker tag / ui version use '-'
        echo "mlrun_version=$(echo ${{ steps.git_upstream_info.outputs.unstable_version_prefix }}+$mlrun_hash)" >> $GITHUB_OUTPUT
        echo "mlrun_docker_tag=$(echo ${{ steps.git_upstream_info.outputs.unstable_version_prefix }}-$mlrun_hash)" >> $GITHUB_OUTPUT
        echo "mlrun_ui_version=${{ steps.git_upstream_info.outputs.unstable_version_prefix }}-$ui_hash" >> $GITHUB_OUTPUT
        # workflow_dispatch inputs are empty on push / schedule events, fall back to the documented defaults
        echo "mlrun_docker_registry=$( \
          input_docker_registry=$INPUT_DOCKER_REGISTRY && \
          echo ${input_docker_registry:-ghcr.io/})" >> $GITHUB_OUTPUT
        echo "mlrun_system_tests_clean_resources=$( \
          input_system_tests_clean_resources=$INPUT_CLEAN_RESOURCES_IN_TEARDOWN && \
          echo ${input_system_tests_clean_resources:-true})" >> $GITHUB_OUTPUT
        echo "override_iguazio_version=$INPUT_OVERRIDE_IGUAZIO_VERSION" >> $GITHUB_OUTPUT
      env:
        INPUT_DOCKER_REGISTRY: ${{ github.event.inputs.docker_registry }}
        INPUT_OVERRIDE_IGUAZIO_VERSION: ${{ github.event.inputs.override_iguazio_version }}
        INPUT_CLEAN_RESOURCES_IN_TEARDOWN: ${{ github.event.inputs.clean_resources_in_teardown }}

    - name: Prepare System Test Environment and Install MLRun
      env:
        IP_ADDR_PREFIX: ${{ secrets.IP_ADDR_PREFIX }}
      timeout-minutes: 50
      # --iguazio-version receives the manual override input (empty when not provided); computed_params exposes it
      # as the override_iguazio_version output
      run: |
        python automation/system_test/prepare.py run \
          --data-cluster-ip "${{ secrets.LATEST_SYSTEM_TEST_DATA_CLUSTER_IP }}" \
          --data-cluster-ssh-username "${{ secrets.LATEST_SYSTEM_TEST_DATA_CLUSTER_SSH_USERNAME }}" \
          --data-cluster-ssh-password "${{ secrets.LATEST_SYSTEM_TEST_DATA_CLUSTER_SSH_PASSWORD }}" \
          --provctl-download-url "${{ secrets.LATEST_SYSTEM_TEST_PROVCTL_DOWNLOAD_PATH }}" \
          --provctl-download-s3-access-key "${{ secrets.LATEST_SYSTEM_TEST_PROVCTL_DOWNLOAD_URL_S3_ACCESS_KEY }}" \
          --provctl-download-s3-key-id "${{ secrets.LATEST_SYSTEM_TEST_PROVCTL_DOWNLOAD_URL_S3_KEY_ID }}" \
          --username "${{ secrets.LATEST_SYSTEM_TEST_USERNAME }}" \
          --access-key "${{ secrets.LATEST_SYSTEM_TEST_ACCESS_KEY }}" \
          --iguazio-version "${{ steps.computed_params.outputs.override_iguazio_version }}" \
          --mlrun-version "${{ steps.computed_params.outputs.mlrun_version }}" \
          --mlrun-ui-version "${{ steps.computed_params.outputs.mlrun_ui_version }}" \
          --mlrun-commit "${{ steps.computed_params.outputs.mlrun_hash }}" \
          --override-image-registry "${{ steps.computed_params.outputs.mlrun_docker_registry }}"

    - name: Prepare System Test env.yml and MLRun installation from current branch
      timeout-minutes: 5
      # the resolved branch is read from the current-branch step: the `needs` context only exposes jobs this job
      # depends on, so a job cannot read its own outputs through it
      run: |
        python automation/system_test/prepare.py env \
          --data-cluster-ip "${{ secrets.LATEST_SYSTEM_TEST_DATA_CLUSTER_IP }}" \
          --data-cluster-ssh-username "${{ secrets.LATEST_SYSTEM_TEST_DATA_CLUSTER_SSH_USERNAME }}" \
          --data-cluster-ssh-password "${{ secrets.LATEST_SYSTEM_TEST_DATA_CLUSTER_SSH_PASSWORD }}" \
          --username "${{ secrets.LATEST_SYSTEM_TEST_USERNAME }}" \
          --access-key "${{ secrets.LATEST_SYSTEM_TEST_ACCESS_KEY }}" \
          --slack-webhook-url "${{ secrets.LATEST_SYSTEM_TEST_SLACK_WEBHOOK_URL }}" \
          --branch "${{ steps.current-branch.outputs.name }}" \
          --github-access-token "${{ secrets.SYSTEM_TEST_GITHUB_ACCESS_TOKEN }}" \
          --save-to-path "$GITHUB_WORKSPACE/env.yml"

    - name: Encrypt file
      id: encrypt_file
      # env.yml is symmetrically encrypted before it is uploaded as an artifact; the test job decrypts it with the
      # same passphrase secret
      run: |
        gpg \
          --batch \
          --passphrase "${{ env.GPG_PASSPHRASE }}" \
          --output "$GITHUB_WORKSPACE/env.yml.gpg" \
          --symmetric "$GITHUB_WORKSPACE/env.yml"

        echo "env_file_path=$(echo $GITHUB_WORKSPACE/env.yml.gpg)" >> $GITHUB_OUTPUT
      env:
        GPG_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}

    - name: Upload env file
      uses: actions/upload-artifact@v3
      with:
        name: env
        path: "${{ steps.encrypt_file.outputs.env_file_path }}"
        if-no-files-found: error

    # values consumed by the test job through the `needs` context
    outputs:
      mlrunVersion: ${{ steps.computed_params.outputs.mlrun_version }}
      mlrunBranch: ${{ steps.current-branch.outputs.name }}
      mlrunSystemTestsCleanResources: ${{ steps.computed_params.outputs.mlrun_system_tests_clean_resources }}

  run-system-tests-enterprise-ci:
    name: Test ${{ matrix.test_component }} [${{ needs.prepare-system-tests-enterprise-ci.outputs.mlrunBranch }}]
    # the prepare job has to finish first, its outputs and the env artifact are consumed below
    needs:
      - prepare-system-tests-enterprise-ci
    # skipped on forks unless triggered manually, point it at your fork when developing
    if: github.repository == 'mlrun/mlrun' || github.event_name == 'workflow_dispatch'
    runs-on:
      - self-hosted
      - Linux
    container:
      image: python:3.9
    # keep this no larger than the schedule cron interval when raising it
    timeout-minutes: 360
    strategy:
      fail-fast: false
      max-parallel: 1
      matrix:
        test_component:
          - api
          - runtimes
          - projects
          - model_monitoring
          - examples
          - backwards_compatibility
          - datastore
          - logs
          - feature_store
    steps:
    - name: Install Dependencies
      run: |
        # gnupg      - decrypts env.yml.gpg
        # nodejs     - needed to install github action dependencies
        # graphviz   - graph generation (feature store tests)
        # git-core   - cloning repos and similar, required by many suites
        # gcc, make  - building python packages
        apt-get update -qqy && \
          apt-get install -y gnupg nodejs graphviz git-core gcc make

    - uses: actions/setup-node@v4
      with:
        node-version: 18

    # The tests themselves run from the resolved branch rather than from the branch that triggered the workflow:
    # the triggering branch may be at a different version than the mlrun deployed by the previous job (it can have
    # features that the resolved branch doesn't have)
    - uses: actions/checkout@v3
      with:
        ref: ${{ needs.prepare-system-tests-enterprise-ci.outputs.mlrunBranch }}

    - uses: actions/download-artifact@v3
      with:
        name: env
        path: /tmp

    - name: Decrypt file
      env:
        GPG_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
      run: |
        gpg \
          --batch \
          --passphrase "${{ env.GPG_PASSPHRASE }}" \
          --output "$GITHUB_WORKSPACE/tests/system/env.yml" \
          --decrypt "/tmp/env.yml.gpg"

        # fail the step if the decrypted file is missing
        test -f "$GITHUB_WORKSPACE/tests/system/env.yml"

    - name: Pre Tests Requirements
      run: |

        # mark the mlrun checkout as a safe git directory, needed since
        # https://github.com/git/git/commit/8959555cee7ec045958f9b6dd62e541affb7e7d9
        git config --global --add safe.directory "$GITHUB_WORKSPACE"
        MLRUN_VERSION="${{ needs.prepare-system-tests-enterprise-ci.outputs.mlrunVersion }}" \
          make install-requirements install-complete-requirements update-version-file

    - name: Run System Tests
      # everything the make target needs is handed over as environment variables on the command itself
      run: |
        MLRUN_SYSTEM_TESTS_GITHUB_RUN_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" \
        MLRUN_SYSTEM_TESTS_CLEAN_RESOURCES="${{ needs.prepare-system-tests-enterprise-ci.outputs.mlrunSystemTestsCleanResources }}" \
        MLRUN_VERSION="${{ needs.prepare-system-tests-enterprise-ci.outputs.mlrunVersion }}" \
        MLRUN_SYSTEM_TESTS_COMPONENT="${{ matrix.test_component }}" \
        MLRUN_SYSTEM_TESTS_BRANCH="${{ needs.prepare-system-tests-enterprise-ci.outputs.mlrunBranch }}" \
          make test-system