-
Notifications
You must be signed in to change notification settings - Fork 27
/
imagenet-efa.yaml
78 lines (78 loc) · 2.75 KB
/
imagenet-efa.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
---
# TorchElastic ElasticJob: distributed ImageNet training on AWS EFA-enabled
# GPU nodes. Workers rendezvous through etcd and read the dataset plus the
# shared checkpoint from an FSx volume mounted at /fsx-shared.
apiVersion: elastic.pytorch.org/v1alpha1
kind: ElasticJob
metadata:
  name: imagenet-efa
  # namespace: elastic-job
spec:
  # Use "etcd-service:2379" if you already applied etcd.yaml
  rdzvEndpoint: etcd-service:2379
  # Elastic range: the job keeps running as workers join/leave between
  # minReplicas and maxReplicas.
  minReplicas: 1
  maxReplicas: 128
  replicaSpecs:
    Worker:
      replicas: 2
      # Restart a worker pod only when the container exits non-zero.
      restartPolicy: ExitCode
      template:
        apiVersion: v1
        kind: Pod
        spec:
          # Pin workers to EFA-capable GPU instances.
          # NOTE(review): the beta.kubernetes.io/instance-type label is
          # deprecated (removed in k8s 1.27); on newer clusters use
          # node.kubernetes.io/instance-type instead — confirm cluster version.
          nodeSelector:
            # beta.kubernetes.io/instance-type: p3dn.24xlarge
            beta.kubernetes.io/instance-type: p4d.24xlarge
            # beta.kubernetes.io/instance-type: g4dn.metal
          containers:
            - name: elasticjob-worker
              image: xxxxxxxxxxxx.dkr.ecr.us-east-1.amazonaws.com/pytorch-efa:latest
              imagePullPolicy: Always
              env:
                # Verbose NCCL logging to confirm EFA is actually used.
                - name: NCCL_DEBUG
                  value: INFO
                - name: NCCL_ALGO
                  value: Ring
                # Route NCCL traffic over libfabric's EFA provider.
                - name: FI_PROVIDER
                  value: efa
                - name: FI_EFA_USE_DEVICE_RDMA
                  value: "1"
                # Required because data-loader workers fork the process.
                - name: RDMAV_FORK_SAFE
                  value: "1"
                - name: FI_LOG_LEVEL
                  value: "1"
              command: ["torchrun"]
              args:
                # One trainer process per GPU on the node.
                - "--nproc_per_node=8"
                - "/workspace/elastic/examples/imagenet/main.py"
                - "--arch=efficientnet_b7"
                - "--epochs=1"
                - "--batch-size=64"
                # number of data loader workers (NOT trainers)
                # zero means load the data on the same process as the trainer
                # pytorch data loaders use shm
                - "--workers=4"
                - "--checkpoint-file=/fsx-shared/checkpoint.pth.tar"
                # This is the directory structure for ImageNet dataset
                - "/fsx-shared/ILSVRC/Data/CLS-LOC/"
              # Requests == limits so the pod gets Guaranteed QoS on the
              # extended resources (GPUs, EFA interfaces, hugepages).
              resources:
                limits:
                  nvidia.com/gpu: 8
                  hugepages-2Mi: 5120Mi
                  vpc.amazonaws.com/efa: 4
                  memory: 80000Mi
                requests:
                  nvidia.com/gpu: 8
                  hugepages-2Mi: 5120Mi
                  vpc.amazonaws.com/efa: 4
                  memory: 80000Mi
              volumeMounts:
                - name: fsx-pv
                  mountPath: /fsx-shared
                # The following enables the worker pods to use increased shared memory
                # which is required when specifying more than 0 data loader workers
                - name: dshm
                  mountPath: /dev/shm
          volumes:
            # Shared FSx filesystem: dataset + checkpoint.
            - name: fsx-pv
              persistentVolumeClaim:
                claimName: fsx-pvc
            # RAM-backed /dev/shm for PyTorch data-loader IPC.
            - name: dshm
              emptyDir:
                medium: Memory