170 lines (141 loc) · 5.86 KB
/
update_repo_datasets.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# Workflow: rebuilds the combined covid datasets and (optionally) triggers an
# API snapshot build afterwards. Runs the heavy lifting on a self-hosted GCE
# runner that is started/stopped by the jobs below.
name: Update combined datasets
# Use a concurrency group to make sure we don't try to have multiple workflows
# run with the hosted runner at the same time.
concurrency: gce-runner
on:
  # 2023/01/17: We no longer have a run scheduled via github actions and instead rely on the prefect
  # scrapers to kick off a build after NYT updates each day.
  # https://github.com/covid-projections/can-scrapers/blob/643819f7a42d0ae227453b1de24a317a54c6cde8/services/prefect/flows/scheduled_nyt_and_parquet_updater.py#L37
  #
  # schedule:
  #   # Run job everyday at 5:00 am EST
  #   - cron: '0 10 * * *'
  workflow_dispatch:
    inputs:
      trigger_api_build:
        description: 'If "true" API snapshot build will be triggered after dataset update.'
        default: 'true'
      refresh_datasets:
        description: 'Set to "false" to skip downloading / re-combining the latest datasets.'
        default: 'true'
  # Allows external systems (per the note above, the prefect scrapers) to
  # trigger this workflow via the GitHub repository-dispatch API. No inputs
  # are defined for this event, so the workflow_dispatch inputs above are
  # undefined on such runs.
  repository_dispatch:
# Environment shared by every job/step in this workflow.
env:
  # Used by python code that reports errors to sentry.
  SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
  SENTRY_ENVIRONMENT: 'production'
  # use a webhook to write to slack channel dev-alerts for QA
  SLACK_DEV_ALERTS_WEBHOOK: ${{ secrets.SLACK_DEV_ALERTS_WEBHOOK }}
  DATA_AVAILABILITY_SHEET_NAME: "Data Availability"
  GOOGLE_SHEETS_SERVICE_ACCOUNT_DATA: ${{ secrets.GOOGLE_SHEETS_SERVICE_ACCOUNT_DATA }}
  # Use trigger_api_build if specified, else use true. This automatically triggers
  # the API build on scheduled runs where trigger_api_build is not defined.
  # https://github.community/t/how-can-you-use-expressions-as-the-workflow-dispatch-input-default/141454/4
  TRIGGER_API_BUILD: ${{ github.event.inputs.trigger_api_build || 'true' }}
  # Emulated ternary via `&& ||`: picks the flag passed to `run.py data update`.
  # NOTE(review): on non-workflow_dispatch runs (e.g. repository_dispatch) the
  # input is undefined, the `== 'true'` comparison is false, and this resolves
  # to '--no-refresh-datasets'.
  REFRESH_DATASETS_ARG: ${{ (github.event.inputs.refresh_datasets == 'true') && '--refresh-datasets' || '--no-refresh-datasets' }}
  # The GCE instance to start / stop before / after running the job.
  GCE_ZONE: "us-west1-b"
  GCE_INSTANCE: "can-actions-runner"
jobs:
  # Boot the self-hosted GCE runner VM; the dataset job below runs on it.
  start-runner:
    runs-on: ubuntu-latest
    steps:
      # Authenticate to Google Cloud with the admin service account so gcloud
      # can manage the runner instance.
      - id: auth
        uses: google-github-actions/auth@v1
        with:
          credentials_json: ${{ secrets.GCE_ADMIN_SERVICE_ACCOUNT }}
      - name: Set up Cloud SDK
        uses: google-github-actions/setup-gcloud@v1
      - name: Start ${{env.GCE_INSTANCE}} VM.
        run: gcloud compute instances start --zone ${{env.GCE_ZONE}} ${{env.GCE_INSTANCE}}
update-and-promote-datasets:
needs: "start-runner"
runs-on: gce-runner
steps:
- name: Parse covid data model branch name and set env variable
run: |
echo "COVID_DATA_MODEL_REF=${GITHUB_REF_NAME}" >> $GITHUB_ENV
- name: Checkout covid-data-model
uses: actions/checkout@v2
with:
repository: act-now-coalition/covid-data-model
path: covid-data-model
ref: '${{env.COVID_DATA_MODEL_REF}}'
lfs: true
- name: Update NYTimes anomalies file
working-directory: ./covid-data-model
run: |
curl https://raw.githubusercontent.com/nytimes/covid-19-data/master/rolling-averages/anomalies.csv --output data/nyt_anomalies.csv
- name: Setup Python
uses: actions/setup-python@v2
with:
python-version: '3.10'
- name: Cache Pip
uses: actions/cache@v1
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }}
restore-keys: |
${{ runner.os }}-pip-
${{ runner.os }}-
- name: Install Dependencies
working-directory: ./covid-data-model
run: pip install -r requirements.txt
- name: prune covid-data-model
working-directory: ./covid-data-model
run: git lfs prune
- name: Update and Promote dataset.
working-directory: ./covid-data-model
run: |
./run.py data update ${{env.REFRESH_DATASETS_ARG}}
- name: Create Update Commit
working-directory: ./covid-data-model
run: ./tools/push-data-update.sh
- name: Maybe Trigger API build
if: env.TRIGGER_API_BUILD == 'true'
working-directory: ./covid-data-model
env:
GITHUB_TOKEN: ${{ secrets.CAN_ROBOT_PERSONAL_ACCESS_TOKEN }}
run: |
./tools/build-snapshot.sh main
- name: Slack notification
if: env.TRIGGER_API_BUILD == 'true'
env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK_DEV_ALERTS }}
uses: Ilshidur/action-slack@fb92a78a305a399cd6d8ede99d641f2b9224daf3
with:
args: 'Started new API build from dataset updater action.'
# TODO(https://trello.com/c/4dFFtQiH/1239-fix-data-availability-report-or-officially-replace-it-with-toms-dashboard)
# - name: Update Data Availability Sheet
# working-directory: ./covid-data-model
# run: |
# ./run.py data update-availability-report
- name: Slack notification
if: failure()
env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK_DEV_ALERTS }}
STATUS: ${{job.status}}
uses: Ilshidur/action-slack@fb92a78a305a399cd6d8ede99d641f2b9224daf3
with:
args: 'update-dataset-snapshot failed'
- name: Slack notification
if: success()
env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK_DEV_ALERTS }}
STATUS: ${{job.status}}
DATA_AVAILABILITY_URL: http://tiny.cc/can-data
uses: Ilshidur/action-slack@fb92a78a305a399cd6d8ede99d641f2b9224daf3
with:
args: 'update-dataset-snapshot succeeded. View Data Availability Report at {{DATA_AVAILABILITY_URL}}'
stop-runner:
if: ${{ always() }}
needs: ["start-runner", "update-and-promote-datasets"]
runs-on: ubuntu-latest
steps:
- id: "auth"
uses: "google-github-actions/auth@v1"
with:
credentials_json: "${{ secrets.GCE_ADMIN_SERVICE_ACCOUNT }}"
- name: "Set up Cloud SDK"
uses: "google-github-actions/setup-gcloud@v1"
- name: "Stop ${{env.GCE_INSTANCE}} VM."
run: "gcloud compute instances stop --zone ${{env.GCE_ZONE}} ${{env.GCE_INSTANCE}}"