170 lines (141 loc) · 5.86 KB
/
update_repo_datasets.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# Workflow: rebuilds the combined covid datasets and (optionally) triggers an
# API snapshot build afterwards. Runs the heavy lifting on a self-hosted GCE
# runner that is started/stopped by the jobs below.
name: Update combined datasets
# Use a concurrency group to make sure we don't try to have multiple workflows
# run with the hosted runner at the same time.
concurrency: gce-runner
on:
  # 2023/01/17: We no longer have a run scheduled via github actions and instead rely on the prefect
  # scrapers to kick off a build after NYT updates each day.
  # https://github.com/covid-projections/can-scrapers/blob/643819f7a42d0ae227453b1de24a317a54c6cde8/services/prefect/flows/scheduled_nyt_and_parquet_updater.py#L37
  #
  # schedule:
  #   # Run job everyday at 5:00 am EST
  #   - cron: '0 10 * * *'
  workflow_dispatch:
    inputs:
      trigger_api_build:
        description: 'If "true" API snapshot build will be triggered after dataset update.'
        default: 'true'
      refresh_datasets:
        description: 'Set to "false" to skip downloading / re-combining the latest datasets.'
        default: 'true'
  # Allows external systems (per the note above, the prefect scrapers) to
  # trigger this workflow via the GitHub repository-dispatch API. No inputs
  # are defined for this event, so the workflow_dispatch inputs above are
  # undefined on such runs.
  repository_dispatch:
# Environment shared by every job/step in this workflow.
env:
  # Used by python code that reports errors to sentry.
  SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
  SENTRY_ENVIRONMENT: 'production'
  # use a webhook to write to slack channel dev-alerts for QA
  SLACK_DEV_ALERTS_WEBHOOK: ${{ secrets.SLACK_DEV_ALERTS_WEBHOOK }}
  DATA_AVAILABILITY_SHEET_NAME: "Data Availability"
  GOOGLE_SHEETS_SERVICE_ACCOUNT_DATA: ${{ secrets.GOOGLE_SHEETS_SERVICE_ACCOUNT_DATA }}
  # Use trigger_api_build if specified, else use true. This automatically triggers
  # the API build on scheduled runs where trigger_api_build is not defined.
  # https://github.community/t/how-can-you-use-expressions-as-the-workflow-dispatch-input-default/141454/4
  TRIGGER_API_BUILD: ${{ github.event.inputs.trigger_api_build || 'true' }}
  # Emulated ternary via `&& ||`: picks the flag passed to `run.py data update`.
  # NOTE(review): on non-workflow_dispatch runs (e.g. repository_dispatch) the
  # input is undefined, the `== 'true'` comparison is false, and this resolves
  # to '--no-refresh-datasets'.
  REFRESH_DATASETS_ARG: ${{ (github.event.inputs.refresh_datasets == 'true') && '--refresh-datasets' || '--no-refresh-datasets' }}
  # The GCE instance to start / stop before / after running the job.
  GCE_ZONE: "us-west1-b"
  GCE_INSTANCE: "can-actions-runner"
jobs:
  # Boot the self-hosted GCE runner VM; the dataset job below runs on it.
  start-runner:
    runs-on: ubuntu-latest
    steps:
      # Authenticate to Google Cloud with the admin service account so gcloud
      # can manage the runner instance.
      - id: auth
        uses: google-github-actions/auth@v1
        with:
          credentials_json: ${{ secrets.GCE_ADMIN_SERVICE_ACCOUNT }}
      - name: Set up Cloud SDK
        uses: google-github-actions/setup-gcloud@v1
      - name: Start ${{env.GCE_INSTANCE}} VM.
        run: gcloud compute instances start --zone ${{env.GCE_ZONE}} ${{env.GCE_INSTANCE}}
update-and-promote-datasets:
needs: "start-runner"
runs-on: gce-runner
steps:
- name: Parse covid data model branch name and set env variable
run: |
echo "COVID_DATA_MODEL_REF=${GITHUB_REF_NAME}" >> $GITHUB_ENV
- name: Checkout covid-data-model
uses: actions/checkout@v2
with:
repository: act-now-coalition/covid-data-model
path: covid-data-model
ref: '${{env.COVID_DATA_MODEL_REF}}'
lfs: true
- name: Update NYTimes anomalies file
working-directory: ./covid-data-model
run: |
curl https://raw.githubusercontent.com/nytimes/covid-19-data/master/rolling-averages/anomalies.csv --output data/nyt_anomalies.csv
- name: Setup Python
uses: actions/setup-python@v2
with:
python-version: '3.10'
- name: Cache Pip
uses: actions/cache@v1
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }}
restore-keys: |
${{ runner.os }}-pip-
${{ runner.os }}-
- name: Install Dependencies
working-directory: ./covid-data-model
run: pip install -r requirements.txt
- name: prune covid-data-model
working-directory: ./covid-data-model
run: git lfs prune
- name: Update and Promote dataset.
working-directory: ./covid-data-model
run: |
./run.py data update ${{env.REFRESH_DATASETS_ARG}}
- name: Create Update Commit
working-directory: ./covid-data-model
run: ./tools/push-data-update.sh
- name: Maybe Trigger API build
if: env.TRIGGER_API_BUILD == 'true'
working-directory: ./covid-data-model
env:
GITHUB_TOKEN: ${{ secrets.CAN_ROBOT_PERSONAL_ACCESS_TOKEN }}
run: |
./tools/build-snapshot.sh main
- name: Slack notification
if: env.TRIGGER_API_BUILD == 'true'
env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK_DEV_ALERTS }}
uses: Ilshidur/action-slack@fb92a78a305a399cd6d8ede99d641f2b9224daf3
with:
args: 'Started new API build from dataset updater action.'
# TODO(https://trello.com/c/4dFFtQiH/1239-fix-data-availability-report-or-officially-replace-it-with-toms-dashboard)
# - name: Update Data Availability Sheet
# working-directory: ./covid-data-model
# run: |
# ./run.py data update-availability-report
- name: Slack notification
if: failure()
env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK_DEV_ALERTS }}
STATUS: ${{job.status}}
uses: Ilshidur/action-slack@fb92a78a305a399cd6d8ede99d641f2b9224daf3
with:
args: 'update-dataset-snapshot failed'
- name: Slack notification
if: success()
env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK_DEV_ALERTS }}
STATUS: ${{job.status}}
DATA_AVAILABILITY_URL: http://tiny.cc/can-data
uses: Ilshidur/action-slack@fb92a78a305a399cd6d8ede99d641f2b9224daf3
with:
args: 'update-dataset-snapshot succeeded. View Data Availability Report at {{DATA_AVAILABILITY_URL}}'
stop-runner:
if: ${{ always() }}
needs: ["start-runner", "update-and-promote-datasets"]
runs-on: ubuntu-latest
steps:
- id: "auth"
uses: "google-github-actions/auth@v1"
with:
credentials_json: "${{ secrets.GCE_ADMIN_SERVICE_ACCOUNT }}"
- name: "Set up Cloud SDK"
uses: "google-github-actions/setup-gcloud@v1"
- name: "Stop ${{env.GCE_INSTANCE}} VM."
run: "gcloud compute instances stop --zone ${{env.GCE_ZONE}} ${{env.GCE_INSTANCE}}"