forked from Lightning-AI/pytorch-lightning
-
Notifications
You must be signed in to change notification settings - Fork 0
150 lines (135 loc) · 5.18 KB
/
tpu-tests.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# Workflow name shown in the GitHub Actions UI.
name: Test PyTorch - TPU

on:
  push:
    branches: [master, "release/*"]
  # NOTE(review): `pull_request_target` runs with access to repository secrets
  # while a later step checks out the PR head SHA — confirm the checked-out PR
  # code is never executed in a context where those secrets are exposed
  # (classic "pwn request" risk).
  pull_request_target:
    branches: [master, "release/*"]
    types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped
    # Only run for TPU-relevant changes; `!`-prefixed patterns exclude docs.
    paths:
      - ".actions/**"
      - ".github/workflows/tpu-tests.yml"
      - "dockers/base-xla/*"
      - "requirements/fabric/**"
      - "src/lightning_fabric/**"
      - "tests/tests_fabric/**"
      - "requirements/pytorch/**"
      - "src/pytorch_lightning/**"
      - "tests/tests_pytorch/**"
      - "setup.cfg" # includes pytest config
      - "!requirements/*/docs.txt"
      - "!*.md"
      - "!**/*.md"
# Cancel superseded in-flight runs for the same ref, but never cancel runs on
# master or release branches (the negated expression below).
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
  cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }}

env:
  # GCP project and GKE cluster that host the TPU test job.
  PROJECT_ID: ${{ secrets.GKE_PROJECT }}
  GKE_CLUSTER: lightning-cluster
  GKE_ZONE: us-central1-a

# All `run` steps default to bash unless a step overrides `shell`.
defaults:
  run:
    shell: bash
jobs:
  # Renders a jsonnet job spec per package, submits it to a GKE cluster with
  # TPUs, waits for completion, and uploads the resulting coverage report.
  test-on-tpus:
    runs-on: ubuntu-22.04
    # `pull_request_target` also fires for drafts; skip them here.
    if: github.event.pull_request.draft == false
    env:
      # Quoted so YAML keeps the version a string — an unquoted 3.10 would be
      # parsed as the float 3.1.
      PYTHON_VER: "3.7"
    strategy:
      fail-fast: false
      max-parallel: 1 # run sequential
      matrix:
        # TODO: add also lightning
        pkg-name: ["fabric", "pytorch"]
    timeout-minutes: 100 # should match the timeout in `tpu_workflow.jsonnet`
    steps:
      - uses: actions/checkout@v3
        with:
          # Check out the PR head, not the merge ref (pull_request_target
          # defaults to the base branch).
          ref: ${{ github.event.pull_request.head.sha }}

      - uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VER }}

      - name: Checkout ml-testing-accelerators
        run: |
          git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git
          cd ml-testing-accelerators
          # Pin to a known-good commit and give it a local branch name.
          git fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable
          git checkout stable

      - uses: actions/setup-go@v3
        with:
          go-version: '1.19'

      - name: Install jsonnet
        run: go install github.com/google/go-jsonnet/cmd/jsonnet@latest

      # Substitute the {PLACEHOLDER} tokens in the per-package jsonnet spec
      # with concrete versions and PR metadata.
      - name: Update jsonnet
        env:
          SCOPE: ${{ matrix.pkg-name }}
          # Quoted so YAML keeps the version a string — an unquoted 1.10-style
          # value would be parsed as the float 1.1.
          XLA_VER: "1.12"
          PR_NUMBER: ${{ github.event.pull_request.number }}
          SHA: ${{ github.event.pull_request.head.sha }}
        run: |
          import os
          fname = f'dockers/base-xla/tpu_workflow_{os.getenv("SCOPE")}.jsonnet'
          with open(fname) as fo:
              data = fo.read()
          data = data.replace('{PYTORCH_VERSION}', os.getenv("XLA_VER"))
          data = data.replace('{PYTHON_VERSION}', os.getenv("PYTHON_VER"))
          data = data.replace('{PR_NUMBER}', os.getenv("PR_NUMBER"))
          data = data.replace('{SHA}', os.getenv("SHA"))
          with open(fname, "w") as fw:
              fw.write(data)
        shell: python

      - name: Show jsonnet
        run: cat dockers/base-xla/tpu_workflow_${{ matrix.pkg-name }}.jsonnet

      - uses: google-github-actions/auth@v1
        with:
          credentials_json: ${{ secrets.GKE_SA_KEY_BASE64 }}

      # https://docs.github.com/en/actions/deployment/deploying-to-your-cloud-provider/deploying-to-google-kubernetes-engine
      - uses: google-github-actions/get-gke-credentials@v1
        with:
          cluster_name: ${{ env.GKE_CLUSTER }}
          location: ${{ env.GKE_ZONE }}

      - name: Deploy cluster
        run: |
          export PATH=$PATH:$HOME/go/bin
          # Render the jsonnet spec and submit it as a Kubernetes job; kubectl
          # prints "job.batch/<name> created", which the next lines strip down
          # to the bare job name.
          job_name=$(jsonnet -J ml-testing-accelerators/ dockers/base-xla/tpu_workflow_${{ matrix.pkg-name }}.jsonnet | kubectl create -f -)
          job_name=${job_name#job.batch/}
          job_name=${job_name% created}
          pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}')
          echo "GKE pod name: $pod_name"
          echo "Waiting on kubernetes job: $job_name"
          status_code=2 &&
          # Check on the job periodically. Set the status code depending on what happened to the job in Kubernetes.
          printf "Waiting for job to finish: "
          while true; do
            if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then
              status_code=1 && break;
            elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1"; then
              status_code=0 && break;
            else
              printf ".";
            fi;
            sleep 5;
          done
          echo "Done waiting. Job status code: $status_code"
          # Capture the pod's full output; on success the coverage XML follows
          # the test logs, with the XML declaration marking the boundary.
          kubectl logs -f $pod_name --container=train > /tmp/full_output.txt
          if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt; then
            # successful run. split the output into logs + coverage report
            csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/';
            cat xx00 # test logs
            mv xx01 coverage.xml
          else
            # failed run, print everything
            cat /tmp/full_output.txt;
          fi
          exit $status_code
        shell: bash

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v3
        # see: https://github.com/actions/toolkit/issues/399
        continue-on-error: true
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          # `file` is deprecated since codecov-action v2; `files` is the
          # supported input name.
          files: coverage.xml
          flags: tpu,pytest,python${{ env.PYTHON_VER }}
          name: TPU-coverage
          fail_ci_if_error: false