# slurm.full.yaml
# Cluster configuration exercising the Slurm scheduler sections below.
---
# Cluster-wide settings: IMDS support level, target region and base image.
Imds:
  ImdsSupport: v2.0
Region: us-east-1
Image:
  Os: centos7
  CustomAmi: ami-12345678
# Login node pools. A single pool named "test" is defined, with its own
# image, subnet list, SSH key and IAM role.
LoginNodes:
  Pools:
    - Name: test
      InstanceType: t2.micro
      Count: 1
      GracetimePeriod: 10
      Image:
        CustomAmi: ami-12345678
      Networking:
        SubnetIds:
          - subnet-12345678
      Ssh:
        KeyName: ec2-key-name
      Iam:
        InstanceRole: arn:aws:iam::aws:role/LoginNodeRole
# Head node definition: instance type, networking, SSH and DCV access,
# local storage, custom action scripts, IAM role, IMDS settings and AMI.
# NOTE(review): nesting was reconstructed from key order; confirm against
# the cluster configuration schema.
HeadNode:
  InstanceType: t2.micro
  Networking:
    SubnetId: subnet-12345678
    ElasticIp: true  # true|false|EIP-id
    AdditionalSecurityGroups:
      - sg-34567890
      - sg-45678901
    Proxy:
      HttpProxyAddress: https://proxy-address:port
  DisableSimultaneousMultithreading: false
  Ssh:
    KeyName: ec2-key-name
    AllowedIps: 1.2.3.4/32
  LocalStorage:
    RootVolume:
      Size: 40
      Encrypted: true
      VolumeType: gp2
      Iops: 100
      DeleteOnTermination: true
    EphemeralVolume:
      MountDir: /test
  SharedStorageType: Efs  # Ebs
  Dcv:
    Enabled: true
    Port: 8443
    # 0.0.0.0/0 matches every IPv4 source address.
    AllowedIps: 0.0.0.0/0
  # Each hook names a script location plus the positional args passed to it.
  CustomActions:
    OnNodeStart:
      Script: https://test.tgz  # s3:// | https://
      Args:
        - arg1
        - arg2
    OnNodeConfigured:
      Script: https://test.tgz  # s3:// | https://
      Args:
        - arg1
        - arg2
    OnNodeUpdated:
      Script: https://test.tgz  # s3:// | https://
      Args:
        - arg1
        - arg2
  Iam:
    InstanceRole: arn:aws:iam::aws:role/CustomHeadNodeRole
  Imds:
    Secured: true
  Image:
    CustomAmi: ami-98765432
# Scheduler configuration: Slurm-wide settings followed by two queues.
# NOTE(review): nesting was reconstructed from key order; confirm against
# the cluster configuration schema.
Scheduling:
  Scheduler: slurm
  ScalingStrategy: best-effort
  SlurmSettings:
    ScaledownIdletime: 10
    QueueUpdateStrategy: TERMINATE
    Dns:
      DisableManagedDns: true
      UseEc2Hostnames: true
    EnableMemoryBasedScheduling: false
    Database:
      Uri: test.databaseserver.com  # required
      UserName: test_admin  # required
      PasswordSecretArn: arn:aws:secretsmanager:us-east-1:111111111111:secret:Secret-xxxxxxxx-xxxxx  # required
      DatabaseName: test_database_name
    ExternalSlurmdbd:
      Host: test.slurmdbd.host  # required
      Port: 6819
  SlurmQueues:
    # queue1: on-demand capacity with a capacity reservation target at
    # queue level and one on each compute resource.
    - Name: queue1
      CapacityType: ONDEMAND
      Networking:
        SubnetIds:
          - subnet-12345678
      CapacityReservationTarget:
        CapacityReservationId: cr-321456cdbd597f551
      ComputeResources:
        - Name: compute-resource-1
          InstanceType: c5.xlarge
          CapacityReservationTarget:
            CapacityReservationId: cr-54321fcdbd5971234
        - Name: compute-resource-2
          InstanceType: c4.xlarge
          CapacityReservationTarget:
            CapacityReservationResourceGroupArn: arn:aws:resource-groups:us-east-1:123456791537:group/MyCRGroup
      CustomActions:
        OnNodeStart:
          Script: https://test.tgz  # s3:// | https://
          Args:
            - arg1
            - arg2
        OnNodeConfigured:
          Script: https://test.tgz  # s3:// | https://
          Args:
            - arg1
            - arg2
      Iam:
        S3Access:
          - BucketName: string1
            EnableWriteAccess: false
        AdditionalIamPolicies:
          - Policy: arn:aws:iam::aws:policy/AdministratorAccess
      Image:
        CustomAmi: ami-12345678
    # queue2: custom compute storage, public IPs, placement group, proxy,
    # and an EFA-enabled compute resource with a spot price.
    - Name: queue2
      ComputeSettings:
        LocalStorage:
          RootVolume:
            Size: 35
            Encrypted: true
            VolumeType: gp2
            Iops: 100
          EphemeralVolume:
            MountDir: /scratch
      Networking:
        SubnetIds:
          - subnet-12345678
        AssignPublicIp: true
        SecurityGroups:
          - sg-12345678
          - sg-12345679
        PlacementGroup:
          Enabled: true
          Id: String
        Proxy:
          HttpProxyAddress: https://proxy-address:port
      ComputeResources:
        - Name: compute-resource-1
          InstanceType: c4.2xlarge
        - Name: compute-resource-2
          InstanceType: c5.2xlarge
          MinCount: 1
          MaxCount: 15
          SpotPrice: 1.1
          DisableSimultaneousMultithreading: true
          Efa:
            Enabled: true
            GdrSupport: false
      Iam:
        InstanceProfile: arn:aws:iam::aws:instance-profile/CustomNodeInstanceProfile
      Image:
        CustomAmi: ami-23456789
# Shared file systems, one entry per storage type: Ebs, Efs, FsxLustre,
# FsxOpenZfs and FsxOntap. Each entry carries a mount dir, a name and a
# type-specific settings mapping.
# NOTE(review): nesting was reconstructed from key order; confirm against
# the cluster configuration schema (in particular DataRepositoryAssociations).
SharedStorage:
  - MountDir: /my/mount/point1
    Name: name1
    StorageType: Ebs
    EbsSettings:
      VolumeType: gp2  # gp2 | gp3 | io1 | io2 | sc1 | st1 | standard
      Iops: 100
      Size: 150
      Encrypted: true
      KmsKeyId: String
      SnapshotId: snap-12345678
      VolumeId: vol-12345678
      DeletionPolicy: Retain
  - MountDir: /my/mount/point2
    Name: name2
    StorageType: Efs
    EfsSettings:
      ThroughputMode: provisioned  # bursting | provisioned
      ProvisionedThroughput: 1024
      EncryptionInTransit: false
      IamAuthorization: false
  - MountDir: /my/mount/point3
    Name: name3
    StorageType: FsxLustre
    FsxLustreSettings:
      StorageCapacity: 3600
      DeploymentType: PERSISTENT_1  # PERSISTENT_1 | PERSISTENT_2 | SCRATCH_1 | SCRATCH_2
      # ImportedFileChunkSize: 1024  # ImportedFileChunkSize cannot coexist with some of the fields
      DataCompressionType: LZ4
      # ExportPath: s3://bucket/folder  # ExportPath cannot coexist with some of the fields
      # ImportPath: s3://bucket  # ImportPath cannot coexist with some of the fields
      # Time-like values are quoted so they are always read as strings.
      WeeklyMaintenanceStartTime: "1:00:00"
      AutomaticBackupRetentionDays: 0
      CopyTagsToBackups: true
      DailyAutomaticBackupStartTime: "01:03"
      PerUnitStorageThroughput: 200
      # BackupId: backup-fedcba98  # BackupId cannot coexist with some of the fields
      KmsKeyId: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
      # FileSystemId: fs-12345678123456789  # FileSystemId cannot coexist with other fields
      # AutoImportPolicy: NEW  # NEW | NEW_CHANGED | NEW_CHANGED_DELETED  # AutoImportPolicy cannot coexist with some of the fields
      DriveCacheType: READ  # READ
      # This StorageType is the FSx drive type, distinct from the entry-level
      # StorageType above.
      StorageType: HDD  # HDD | SSD
      # Four associations covering: both policies, import only, export only,
      # and neither policy.
      DataRepositoryAssociations:
        - Name: dra1
          BatchImportMetaDataOnCreate: false
          DataRepositoryPath: s3://bucket/folder1
          FileSystemPath: /one
          ImportedFileChunkSize: 1024
          AutoExportPolicy: [NEW, CHANGED, DELETED]
          AutoImportPolicy: [NEW, CHANGED, DELETED]
        - Name: dra2
          BatchImportMetaDataOnCreate: false
          DataRepositoryPath: s3://bucket/folder2
          FileSystemPath: /two
          ImportedFileChunkSize: 1024
          AutoImportPolicy: [NEW, DELETED]
        - Name: dra3
          BatchImportMetaDataOnCreate: false
          DataRepositoryPath: s3://bucket/folder3
          FileSystemPath: /three
          ImportedFileChunkSize: 1024
          AutoExportPolicy: [DELETED]
        - Name: dra4
          BatchImportMetaDataOnCreate: false
          DataRepositoryPath: s3://bucket/folder4
          FileSystemPath: /four
          ImportedFileChunkSize: 1024
  - MountDir: /my/mount/point4
    Name: name4
    StorageType: FsxOpenZfs
    FsxOpenZfsSettings:
      VolumeId: fsvol-12345678901234567
  - MountDir: /my/mount/point5
    Name: name5
    StorageType: FsxOntap
    FsxOntapSettings:
      VolumeId: fsvol-12345678901234567
# Cluster-level IAM roles.
Iam:
  Roles:
    LambdaFunctionsRole: String  # arn:aws:iam::aws:role/CustomResourcesLambdaRole
# Monitoring: detailed monitoring flag, CloudWatch log settings and rotation,
# dashboards and alarms.
# NOTE(review): the trailing comments on values look like the defaults;
# confirm against the configuration reference.
Monitoring:
  DetailedMonitoring: true  # false
  Logs:
    CloudWatch:
      Enabled: true  # true
      RetentionInDays: 30  # 14
      DeletionPolicy: Retain
    Rotation:
      Enabled: true
  Dashboards:
    CloudWatch:
      Enabled: false  # true
  Alarms:
    Enabled: false  # true
# Optional software packages; the Intel HPC platform package is turned off.
AdditionalPackages:
  IntelSoftware:
    IntelHpcPlatform: false
# Key/value tags, followed by the custom S3 bucket name and the URL of an
# additional resources template.
Tags:
  - Key: String
    Value: String
  - Key: two
    Value: two22
CustomS3Bucket: String
AdditionalResources: https://template.url
# Directory service (LDAP) integration. Fields marked "required" must be
# provided; AdditionalSssdConfigs is a free-form parameter/value mapping.
DirectoryService:
  DomainName: string  # required
  DomainAddr: string  # required
  PasswordSecretArn: arn:aws:secretsmanager:us-east-1:111111111111:secret:Secret-xxxxxxxx-xxxxx  # required
  DomainReadOnlyUser: string  # required
  LdapTlsCaCert: string  # optional, default is dictated by LDAP client
  LdapTlsReqCert: never
  LdapAccessFilter: string  # optional
  GenerateSshKeysForUsers: false
  AdditionalSssdConfigs:
    parameter_1: value_1
    parameter_2: value_2
# Developer overrides: template, cookbook and package locations, AMI search
# filters, bootstrap timeouts and the compute startup-time metric flag.
# NOTE(review): nesting was reconstructed from key order; confirm against
# the cluster configuration schema.
DevSettings:
  ClusterTemplate: https://tests/aws-parallelcluster-template-3.0.tgz
  Cookbook:
    ChefCookbook: https://tests/aws-parallelcluster-cookbook-3.0.tgz
    # Literal block scalar: the JSON document is passed through as one
    # string, with a single trailing newline.
    ExtraChefAttributes: |
      {"cluster": {"scheduler_slots": "cores", "slurm_node_reg_mem_percent": 75, "realmemory_to_ec2memory_ratio": 0.95}}
  AwsBatchCliPackage: s3://test/aws-parallelcluster-batch-3.0.tgz
  NodePackage: s3://test/aws-parallelcluster-node-3.0.tgz
  AmiSearchFilters:
    Tags:
      - Key: tag1
        Value: value1
      - Key: tag2
        Value: value2
    Owner: self
  Timeouts:
    HeadNodeBootstrapTimeout: 1201  # Default 1800 (seconds)
    ComputeNodeBootstrapTimeout: 1001  # Default 1800 (seconds)
  ComputeStartupTimeMetricEnabled: false
# Deployment-level switches; sudo access for the default user is disabled.
DeploymentSettings:
  DisableSudoAccessForDefaultUser: true