# dask-environment.yaml
# Stack inputs. Every parameter is a String; values that look like numbers or
# booleans are quoted so they are never re-typed by a YAML parser.
Parameters:
  # Repository used as the notebook instance's default code repository.
  SagemakerCodeRepo:
    Description: "Github Repository for loading into the Sagemaker Jupyter environment"
    Type: String
    Default: "https://github.com/awslabs/amazon-asdi.git"

  # Instance type of the SageMaker notebook, restricted to the list below.
  SagemakerNotebookInstance:
    Description: "Sagemaker Notebook instance type"
    Type: String
    Default: ml.m5.2xlarge
    AllowedValues:
      - ml.t3.medium
      - ml.t3.large
      - ml.t3.xlarge
      - ml.t3.2xlarge
      - ml.m5.large
      - ml.m5.2xlarge
      - ml.m5.4xlarge

  # Image run by both the scheduler and the worker task definitions.
  DaskImage:
    Description: "Container image to use for the Dask workers before our custom image is built"
    Type: String
    Default: "daskdev/dask:latest"

  # Applied to the worker task definition at both task and container level.
  DaskWorkerCPU:
    Description: "CPU units to assign to dask workers (1024 = 1 vCPU)"
    Type: String
    Default: "1024"

  # Applied to the worker task definition at both task and container level.
  DaskWorkerMemory:
    Description: "Memory in MiB for dask workers"
    Type: String
    Default: "8192"

  # Drives DaskWorkerSpotPricingCondition; must stay a quoted string because
  # bare YES/NO are booleans under YAML 1.1.
  DaskWorkerSpotPricing:
    Description: "Use Fargate spot pricing for dask workers, to save money (workers may be terminated)"
    Type: String
    Default: "NO"
    AllowedValues:
      - "YES"
      - "NO"
Conditions:
  # True only when the DaskWorkerSpotPricing parameter equals the string 'YES'.
  DaskWorkerSpotPricingCondition:
    Fn::Equals:
      - Ref: DaskWorkerSpotPricing
      - "YES"
Resources:
  # ---------------------------------------------------------------------------
  # Networking: one VPC with a public and a private subnet in the first AZ.
  # ---------------------------------------------------------------------------
  DaskVpc:
    Type: AWS::EC2::VPC
    Properties:
      CidrBlock: 10.10.0.0/16
      EnableDnsHostnames: true
      EnableDnsSupport: true
      InstanceTenancy: default
      Tags:
        - Key: Name
          Value: dask-fargate/MyVpc

  # Public subnet: hosts the scheduler service, the notebook and the NAT gateway.
  PublicSubnet1:
    Type: AWS::EC2::Subnet
    Properties:
      CidrBlock: 10.10.0.0/18
      VpcId:
        Ref: DaskVpc
      AvailabilityZone:
        Fn::Select:
          - 0
          - Fn::GetAZs: ""
      MapPublicIpOnLaunch: true
      Tags:
        - Key: Name
          Value: dask-fargate/PublicSubnet1

  PublicSubnet1RouteTable:
    Type: AWS::EC2::RouteTable
    Properties:
      VpcId:
        Ref: DaskVpc
      Tags:
        - Key: Name
          Value: dask-fargate/MyVpc/PublicSubnet1

  PublicSubnet1RouteTableAssociation:
    Type: AWS::EC2::SubnetRouteTableAssociation
    Properties:
      RouteTableId:
        Ref: PublicSubnet1RouteTable
      SubnetId:
        Ref: PublicSubnet1

  # Default route of the public subnet goes straight to the internet gateway;
  # the gateway must be attached to the VPC before the route can be created.
  PublicSubnet1DefaultRoute:
    Type: AWS::EC2::Route
    Properties:
      RouteTableId:
        Ref: PublicSubnet1RouteTable
      DestinationCidrBlock: 0.0.0.0/0
      GatewayId:
        Ref: InternetGateway
    DependsOn:
      - VPCGatewayA

  # Elastic IP consumed by the NAT gateway. An EIP defined in the same template
  # as its VPC must wait for the VPC gateway attachment, hence the DependsOn.
  PublicSubnet1EIP:
    Type: AWS::EC2::EIP
    DependsOn:
      - VPCGatewayA
    Properties:
      Domain: vpc
      Tags:
        - Key: Name
          Value: dask-fargate/MyVpc/PublicSubnet1

  InternetGateway:
    Type: AWS::EC2::InternetGateway
    Properties:
      Tags:
        - Key: Name
          Value: dask-fargate/MyVpc

  # Attaches the internet gateway to the VPC.
  VPCGatewayA:
    Type: AWS::EC2::VPCGatewayAttachment
    Properties:
      VpcId:
        Ref: DaskVpc
      InternetGatewayId:
        Ref: InternetGateway

  # NAT gateway placed in the public subnet; it is the default route target of
  # the private subnet. The explicit DependsOn makes the ordering against the
  # internet gateway attachment visible instead of relying only on the EIP.
  NATGateway:
    Type: AWS::EC2::NatGateway
    DependsOn:
      - VPCGatewayA
    Properties:
      AllocationId: !GetAtt PublicSubnet1EIP.AllocationId
      SubnetId: !Ref PublicSubnet1

  # Private subnet: hosts the dask worker service.
  PrivateSubnet1:
    Type: AWS::EC2::Subnet
    Properties:
      CidrBlock: 10.10.64.0/18
      VpcId:
        Ref: DaskVpc
      AvailabilityZone:
        Fn::Select:
          - 0
          - Fn::GetAZs: ""
      Tags:
        - Key: Name
          Value: dask-fargate/PrivateSubnet1

  PrivateSubnet1RouteTable:
    Type: AWS::EC2::RouteTable
    Properties:
      VpcId:
        Ref: DaskVpc
      Tags:
        - Key: Name
          Value: dask-fargate/MyVpc/PrivateSubnet1

  PrivateSubnet1RouteTableAssociation:
    Type: AWS::EC2::SubnetRouteTableAssociation
    Properties:
      RouteTableId:
        Ref: PrivateSubnet1RouteTable
      SubnetId:
        Ref: PrivateSubnet1

  # Outbound traffic from the private subnet leaves through the NAT gateway.
  PrivateSubnet1DefaultRoute:
    Type: AWS::EC2::Route
    Properties:
      RouteTableId:
        Ref: PrivateSubnet1RouteTable
      DestinationCidrBlock: 0.0.0.0/0
      NatGatewayId:
        Ref: NATGateway

  # S3 VPC endpoint associated with the private subnet's route table.
  PrivateSubnetS3Endpoint:
    Type: AWS::EC2::VPCEndpoint
    Properties:
      RouteTableIds:
        - !Ref PrivateSubnet1RouteTable
      ServiceName: !Sub 'com.amazonaws.${AWS::Region}.s3'
      VpcId: !Ref DaskVpc

  # ---------------------------------------------------------------------------
  # Logging: short-retention log groups, deleted together with the stack.
  # ---------------------------------------------------------------------------
  LogGroupScheduler:
    Type: AWS::Logs::LogGroup
    Properties:
      LogGroupName: !Sub /aws/${AWS::StackName}/dask/scheduler
      RetentionInDays: 3
    UpdateReplacePolicy: Delete
    DeletionPolicy: Delete

  LogGroupWorker:
    Type: AWS::Logs::LogGroup
    Properties:
      LogGroupName: !Sub /aws/${AWS::StackName}/dask/worker
      RetentionInDays: 3
    UpdateReplacePolicy: Delete
    DeletionPolicy: Delete

  # ---------------------------------------------------------------------------
  # IAM for ECS tasks. The same role is referenced as both ExecutionRoleArn and
  # TaskRoleArn by the scheduler and worker task definitions.
  # ---------------------------------------------------------------------------
  ECSExecutionRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Statement:
          - Action: sts:AssumeRole
            Effect: Allow
            Principal:
              Service: ecs-tasks.amazonaws.com
        Version: "2012-10-17"

  # Grants ECR image pulls plus log-stream writes to the two log groups above.
  ECSExecutionRolePolicy:
    Type: AWS::IAM::Policy
    Properties:
      PolicyDocument:
        Statement:
          - Action:
              - ecr:BatchCheckLayerAvailability
              - ecr:GetDownloadUrlForLayer
              - ecr:BatchGetImage
              - ecr:GetAuthorizationToken
            Effect: Allow
            Resource: "*"
          - Action:
              - logs:CreateLogStream
              - logs:PutLogEvents
            Effect: Allow
            Resource:
              Fn::GetAtt:
                - LogGroupScheduler
                - Arn
          - Action:
              - logs:CreateLogStream
              - logs:PutLogEvents
            Effect: Allow
            Resource:
              Fn::GetAtt:
                - LogGroupWorker
                - Arn
        Version: "2012-10-17"
      PolicyName: ECSExecutionRolePolicy
      Roles:
        - Ref: ECSExecutionRole

  # ---------------------------------------------------------------------------
  # ECS cluster and service discovery.
  # ---------------------------------------------------------------------------
  # Both Fargate capacity providers are registered; the default strategy puts
  # all weight on on-demand FARGATE.
  DaskCluster:
    Type: AWS::ECS::Cluster
    Properties:
      ClusterName: !Join ["-", [!Ref "AWS::StackName", "Fargate-Dask-Cluster"]]
      CapacityProviders:
        - FARGATE
        - FARGATE_SPOT
      DefaultCapacityProviderStrategy:
        - CapacityProvider: FARGATE
          Weight: 1
        - CapacityProvider: FARGATE_SPOT
          Weight: 0

  # Private DNS namespace; the worker command addresses the scheduler as
  # dask-scheduler.local-dask, so this name and the discovery service name
  # below must stay in sync with that command.
  DaskClusterPrivateNS:
    Type: AWS::ServiceDiscovery::PrivateDnsNamespace
    Properties:
      Name: local-dask
      Vpc:
        Ref: DaskVpc

  # ---------------------------------------------------------------------------
  # Task definitions.
  # ---------------------------------------------------------------------------
  # Scheduler task: fixed 4096 CPU units / 30720 MiB, dashboard on port 8787.
  SchedulerDefinition:
    Type: AWS::ECS::TaskDefinition
    Properties:
      ContainerDefinitions:
        - Command:
            - dask-scheduler
            - --dashboard
            - --dashboard-address
            - '8787'
          Cpu: 4096
          Essential: true
          Image: !Ref DaskImage
          LogConfiguration:
            LogDriver: awslogs
            Options:
              awslogs-group:
                Ref: LogGroupScheduler
              awslogs-stream-prefix: ecs
              awslogs-region:
                Ref: AWS::Region
          Memory: 30720
          MemoryReservation: 30720
          Name: Dask
      Cpu: "4096"
      ExecutionRoleArn:
        Fn::GetAtt:
          - ECSExecutionRole
          - Arn
      Family: Dask-Scheduler
      Memory: "30720"
      NetworkMode: awsvpc
      RequiresCompatibilities:
        - FARGATE
      TaskRoleArn:
        Fn::GetAtt:
          - ECSExecutionRole
          - Arn

  # Worker task: CPU and memory come from the DaskWorkerCPU / DaskWorkerMemory
  # parameters.
  # NOTE(review): --memory-limit is hard-coded to 7000MB and does not follow
  # DaskWorkerMemory; confirm this is intended when changing that parameter.
  WorkerDefinition:
    Type: AWS::ECS::TaskDefinition
    Properties:
      ContainerDefinitions:
        - Command:
            - dask-worker
            - dask-scheduler.local-dask:8786
            - --memory-limit
            - '7000MB'
            - --worker-port
            - '9000'
            - --no-nanny
            - --no-dashboard
            - --death-timeout
            - '30'
            - --nthreads
            - '2'
            - --nprocs
            - '1'
            - --reconnect
          Cpu: !Ref DaskWorkerCPU
          Essential: true
          Image: !Ref DaskImage
          LogConfiguration:
            LogDriver: awslogs
            Options:
              awslogs-group:
                Ref: LogGroupWorker
              awslogs-stream-prefix: ecs
              awslogs-region:
                Ref: AWS::Region
          Memory: !Ref DaskWorkerMemory
          MemoryReservation: !Ref DaskWorkerMemory
          Name: Dask
      Cpu: !Ref DaskWorkerCPU
      ExecutionRoleArn:
        Fn::GetAtt:
          - ECSExecutionRole
          - Arn
      Family: Dask-Worker
      Memory: !Ref DaskWorkerMemory
      NetworkMode: awsvpc
      RequiresCompatibilities:
        - FARGATE
      TaskRoleArn:
        Fn::GetAtt:
          - ECSExecutionRole
          - Arn

  # ---------------------------------------------------------------------------
  # Security groups.
  # ---------------------------------------------------------------------------
  # NOTE(review): the dashboard port (8787) is open to 0.0.0.0/0. Left as-is
  # because the rule is explicitly described as intentional; consider
  # restricting the CIDR for anything other than a demo.
  DaskSchedulerSecurityGroup:
    Type: AWS::EC2::SecurityGroup
    Properties:
      GroupDescription: Enable Scheduler ports access
      GroupName: DaskSchedulerSecurityGroup
      SecurityGroupEgress:
        - CidrIp: 0.0.0.0/0
          Description: Allow all outbound traffic by default
          IpProtocol: "-1"
      SecurityGroupIngress:
        - CidrIp: 0.0.0.0/0
          Description: Allow access to the dask dashboard
          FromPort: 8787
          IpProtocol: tcp
          ToPort: 8787
        - SourceSecurityGroupId: !GetAtt NotebookSecurityGroup.GroupId
          Description: Allow access from SageMaker notebook
          FromPort: 8786
          IpProtocol: tcp
          ToPort: 8789
      VpcId: !Ref DaskVpc

  # Attached to the notebook instance; defines egress only, no ingress rules.
  NotebookSecurityGroup:
    Type: AWS::EC2::SecurityGroup
    Properties:
      GroupDescription: Allow access to SageMaker notebook instance
      GroupName: NotebookSecurityGroup
      SecurityGroupEgress:
        - CidrIp: 0.0.0.0/0
          Description: Allow all outbound traffic by default
          IpProtocol: '-1'
      VpcId: !Ref DaskVpc

  # Workers accept all traffic from the scheduler and notebook groups.
  DaskWorkerSecurityGroup:
    Type: AWS::EC2::SecurityGroup
    Properties:
      GroupDescription: Restrict connectivity to dask workers
      GroupName: DaskWorkerSecurityGroup
      SecurityGroupEgress:
        - CidrIp: 0.0.0.0/0
          Description: Allow all outbound traffic by default
          IpProtocol: '-1'
      SecurityGroupIngress:
        - SourceSecurityGroupId: !GetAtt DaskSchedulerSecurityGroup.GroupId
          Description: Allow full access from dask scheduler
          IpProtocol: '-1'
        - SourceSecurityGroupId: !GetAtt NotebookSecurityGroup.GroupId
          Description: Allow full access from the notebook
          IpProtocol: '-1'
      VpcId: !Ref DaskVpc

  # The two rules below are standalone ingress resources because they reference
  # the worker group itself (or would make the scheduler and worker groups
  # reference each other inline, which is a circular dependency).
  DaskWorkerToWorkerIngress:
    Type: AWS::EC2::SecurityGroupIngress
    Properties:
      Description: Allow dask workers to communicate freely with each other
      GroupId: !GetAtt DaskWorkerSecurityGroup.GroupId
      SourceSecurityGroupId: !GetAtt DaskWorkerSecurityGroup.GroupId
      IpProtocol: '-1'

  DaskWorkerToSchedulerIngress:
    Type: AWS::EC2::SecurityGroupIngress
    Properties:
      Description: Allow dask workers to communicate with the scheduler
      GroupId: !GetAtt DaskSchedulerSecurityGroup.GroupId
      SourceSecurityGroupId: !GetAtt DaskWorkerSecurityGroup.GroupId
      IpProtocol: '-1'

  # ---------------------------------------------------------------------------
  # ECS services.
  # ---------------------------------------------------------------------------
  # Single scheduler task in the public subnet with a public IP, registered in
  # service discovery so workers can resolve it by name.
  DaskSchedulerService:
    Type: AWS::ECS::Service
    Properties:
      Cluster:
        Ref: DaskCluster
      DeploymentConfiguration:
        MaximumPercent: 200
        MinimumHealthyPercent: 100
      DesiredCount: 1
      EnableECSManagedTags: false
      LaunchType: FARGATE
      NetworkConfiguration:
        AwsvpcConfiguration:
          AssignPublicIp: ENABLED
          SecurityGroups:
            - Fn::GetAtt:
                - DaskSchedulerSecurityGroup
                - GroupId
          Subnets:
            - Ref: PublicSubnet1
      ServiceName: Dask-Scheduler
      ServiceRegistries:
        - RegistryArn:
            Fn::GetAtt:
              - DaskSchedulerServiceDiscovery
              - Arn
      TaskDefinition:
        Ref: SchedulerDefinition

  # A-record discovery entry named Dask-Scheduler inside the local-dask namespace.
  DaskSchedulerServiceDiscovery:
    Type: AWS::ServiceDiscovery::Service
    Properties:
      DnsConfig:
        DnsRecords:
          - TTL: 5
            Type: A
        NamespaceId:
          Fn::GetAtt:
            - DaskClusterPrivateNS
            - Id
        RoutingPolicy: MULTIVALUE
      HealthCheckCustomConfig:
        FailureThreshold: 1
      Name: Dask-Scheduler
      NamespaceId:
        Fn::GetAtt:
          - DaskClusterPrivateNS
          - Id

  # Worker service starts with zero tasks in the private subnet. The capacity
  # provider is FARGATE_SPOT when DaskWorkerSpotPricingCondition holds,
  # otherwise FARGATE.
  DaskWorkerService:
    Type: AWS::ECS::Service
    Properties:
      CapacityProviderStrategy:
        - CapacityProvider:
            Fn::If:
              - DaskWorkerSpotPricingCondition
              - FARGATE_SPOT
              - FARGATE
          Weight: 1
      Cluster:
        Ref: DaskCluster
      DeploymentConfiguration:
        MaximumPercent: 200
        MinimumHealthyPercent: 100
      DesiredCount: 0
      EnableECSManagedTags: false
      NetworkConfiguration:
        AwsvpcConfiguration:
          SecurityGroups:
            - Fn::GetAtt:
                - DaskWorkerSecurityGroup
                - GroupId
          Subnets:
            - Ref: PrivateSubnet1
      ServiceName: Dask-Worker
      TaskDefinition:
        Ref: WorkerDefinition

  # ---------------------------------------------------------------------------
  # SageMaker notebook.
  # ---------------------------------------------------------------------------
  SagemakerRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Statement:
          - Action: sts:AssumeRole
            Effect: Allow
            Principal:
              Service: sagemaker.amazonaws.com
        Version: "2012-10-17"

  # NOTE(review): s3:*, ecs:* and logs:* on Resource "*" is very broad. Left
  # unchanged because the notebooks' exact needs are not visible here; scope
  # it down once they are known.
  SagemakerPolicy:
    Type: AWS::IAM::Policy
    Properties:
      PolicyDocument:
        Statement:
          - Action:
              - s3:*
              - ecs:*
              - logs:*
              - ec2:DescribeNetworkInterfaces
            Effect: Allow
            Resource: "*"
        Version: "2012-10-17"
      PolicyName: notebookAccessPolicy
      Roles:
        - Ref: SagemakerRole

  # OnStart hook: writes /home/ec2-user/setup.sh and launches it in the
  # background as ec2-user, logging to /home/ec2-user/output.log.
  #
  # The heredoc delimiter (EOF) is unquoted, so the outer shell expands
  # variables and command substitutions while *writing* setup.sh. Anything that
  # must be evaluated when setup.sh actually runs is therefore backslash-escaped
  # (\$(date), \${PWD}). The content is a plain literal (no Fn::Sub), so
  # CloudFormation itself performs no substitution on it.
  NotebookLifecycleConfig:
    Type: AWS::SageMaker::NotebookInstanceLifecycleConfig
    Properties:
      OnStart:
        - Content:
            Fn::Base64: |
              #!/bin/sh
              set -e
              cat > /home/ec2-user/setup.sh << EOF
              #!/bin/bash
              sleep 10
              echo "\$(date) Creating daskpy3 conda environment"
              echo "\$(date) Running in directory \${PWD}"
              conda create --name daskpy3 python="3.7.10" -y
              source activate daskpy3
              conda install -c conda micromamba -y
              micromamba install -c conda basemap proj4 nodejs ipywidgets -y
              pip install botocore==1.20.106 aiobotocore==1.4.1 zarr==2.9.5 rechunker==0.4.2 ipykernel==6.3.1 boto3==1.17.106 dask==2021.8.1 distributed==2021.8.1 tornado==6.1 cloudpickle==1.6.0 msgpack==1.0.2 blosc==1.10.2 numpy==1.21.2 pandas==1.3.2 lz4==3.1.3 netcdf4==1.5.7 xarray==0.19.0 bokeh==2.2.3 s3fs==2021.8.1 fsspec==2021.8.1 h5netcdf==0.11.0 h5py==3.4.0 intake-esm==2021.8.17 intake==0.6.3
              python -m ipykernel install --user --name daskpy3 --display-name "conda_daskpy3"
              echo "\$(date) Updating Jupyter config options"
              cp ~/.jupyter/jupyter_notebook_config.py ~/.jupyter/jupyter_notebook_config.py.bak
              sed -i 's/^#c.NotebookApp.iopub_data_rate_limit.*$/c.NotebookApp.iopub_data_rate_limit = 1.0e10/' ~/.jupyter/jupyter_notebook_config.py
              sed -i 's/^#c.NotebookApp.iopub_msg_rate_limit.*$/c.NotebookApp.iopub_msg_rate_limit = 1.0e10/' ~/.jupyter/jupyter_notebook_config.py
              source /home/ec2-user/anaconda3/bin/deactivate
              source activate JupyterSystemEnv
              jupyter labextension install @jupyter-widgets/jupyterlab-manager@1.1
              source /home/ec2-user/anaconda3/bin/deactivate
              echo "\$(date) Finished!"
              EOF
              chown ec2-user:ec2-user /home/ec2-user/setup.sh
              chmod +x /home/ec2-user/setup.sh
              sudo -u ec2-user -i nohup /home/ec2-user/setup.sh >/home/ec2-user/output.log 2>&1 &

  # Notebook instance in the public subnet, using the lifecycle config above
  # and the repository from the SagemakerCodeRepo parameter.
  DaskNotebook:
    Type: AWS::SageMaker::NotebookInstance
    Properties:
      InstanceType: !Ref SagemakerNotebookInstance
      RoleArn:
        Fn::GetAtt:
          - SagemakerRole
          - Arn
      DefaultCodeRepository: !Ref SagemakerCodeRepo
      DirectInternetAccess: Enabled
      LifecycleConfigName: !GetAtt NotebookLifecycleConfig.NotebookInstanceLifecycleConfigName
      RootAccess: Enabled
      SecurityGroupIds:
        - Fn::GetAtt:
            - NotebookSecurityGroup
            - GroupId
      SubnetId:
        Ref: PublicSubnet1
      VolumeSizeInGB: 50
Outputs:
  # Console deep link that opens the notebook instance in JupyterLab view.
  # Quoted because the URL contains '#', which would otherwise start a comment.
  JupyterNotebook:
    Value: !Sub 'https://${AWS::Region}.console.aws.amazon.com/sagemaker/home?region=${AWS::Region}#/notebook-instances/openNotebook/${DaskNotebook.NotebookInstanceName}?view=lab'

  # Ref of the ECS cluster resource.
  DaskECSClusterName:
    Value:
      Ref: DaskCluster