Merge branch 'master' into avoid_node_on_retry
JamesMurkin committed Apr 13, 2023
2 parents 269126b + ec82b5c commit 4f34365
Showing 18 changed files with 713 additions and 227 deletions.
4 changes: 2 additions & 2 deletions client/python/pyproject.toml

```diff
@@ -9,9 +9,9 @@ license = { text = "Apache Software License" }
 authors = [{ name = "G-Research Open Source Software", email = "armada@armadaproject.io" }]

 [project.optional-dependencies]
-format = ["black==23.1.0", "flake8==6.0.0", "pylint==2.16.3"]
+format = ["black==23.3.0", "flake8==6.0.0", "pylint==2.17.2"]
 docs = ["sphinx", "sphinx-jekyll-builder", "sphinx-toolbox==3.2.0b1"]
-test = ["pytest==7.2.1", "coverage>=6.5.0"]
+test = ["pytest==7.2.2", "coverage>=6.5.0"]

 [build-system]
 requires = ["setuptools"]
```
13 changes: 9 additions & 4 deletions cmd/armadactl/README.md

````diff
@@ -57,13 +57,18 @@ armadactl version [flags]
 ```bash
 armadactl watch [deployment_name] [flags]
 ```
-- getQueueSchedulingReport : This subcommand retrieves a report of the current scheduling status of all queues in the Armada cluster.
+- queue-report : This subcommand retrieves a report of the current scheduling status of all queues in the Armada cluster.
 ```bash
-armadactl getQueueSchedulingReport
+armadactl queue-report
 ```
-- getJobSchedulingReport : This subcommand retrieves a report of the current scheduling status of all jobs in the Armada cluster.
+- job-report : This subcommand retrieves a report of the current scheduling status of all jobs in the Armada cluster.
 ```bash
-armadactl getJobSchedulingReport
+armadactl job-report
 ```
+
+- scheduling-report : This subcommand retrieves a report of the current scheduling status in the Armada cluster.
+```bash
+armadactl scheduling-report
+```

 For a full list of subcommands and options, you can run **armadactl --help**.
````
5 changes: 4 additions & 1 deletion config/executor/config.yaml

```diff
@@ -46,7 +46,10 @@ kubernetes:
   maxTerminatedPods: 1000 # Should be lower than kube-controller-manager's terminated-pod-gc-threshold (default 12500)
   stuckTerminatingPodExpiry: 1m
   podKillTimeout: 5m
-  nodeReservedResourcesPriority: 2000001000 # same priority as system-node-critical
+  minimumResourcesMarkedAllocatedToNonArmadaPodsPerNode:
+    cpu: 100m
+    memory: 50Mi
+  minimumResourcesMarkedAllocatedToNonArmadaPodsPerNodePriority: 2000001000 # same priority as system-node-critical
   podDefaults:
     ingress:
       hostnameSuffix: "svc"
```
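
For operators carrying existing executor config through this change, the rename is mechanical. A sketch of the before/after mapping — the old `nodeReservedResources` key spelling is an assumption inferred from the removed Go field, since only `nodeReservedResourcesPriority` appears in this diff:

```yaml
# Before (hypothetical key spelling, inferred from the removed Go fields):
kubernetes:
  nodeReservedResources:
    cpu: 100m
    memory: 50Mi
  nodeReservedResourcesPriority: 2000001000

# After (keys as shown in this diff):
kubernetes:
  minimumResourcesMarkedAllocatedToNonArmadaPodsPerNode:
    cpu: 100m
    memory: 50Mi
  minimumResourcesMarkedAllocatedToNonArmadaPodsPerNodePriority: 2000001000
```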
16 changes: 16 additions & 0 deletions config/jobservice/config.yaml

```diff
@@ -2,7 +2,23 @@ grpcPort: 60003
 httpPort: 8090
 purgeJobSetTime: 1000
 subscribeJobSetTime: 100
+# databaseType can be either 'postgres' or 'sqlite'
+databaseType: "postgres"
+# databasePath specifies the location of the back-end
+# storage file when using database type 'sqlite'
 databasePath: "/var/jobservice.db"
+# Connection details when using database type 'postgres'
+postgresConfig:
+  maxOpenConns: 50
+  maxIdleConns: 10
+  connMaxLifetime: 30m
+  connection:
+    host: postgres
+    port: 5432
+    user: postgres
+    password: psw
+    dbname: postgres
+    sslmode: disable
 grpc:
   keepaliveParams:
     maxConnectionIdle: 5m
```
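
The comments above also describe a sqlite variant. A minimal sketch of such a config, assuming (as the comments imply) that sqlite needs only `databasePath` and no `postgresConfig` block:

```yaml
grpcPort: 60003
httpPort: 8090
purgeJobSetTime: 1000
subscribeJobSetTime: 100
databaseType: "sqlite"
# With sqlite, job-set state lives in this single local file; the
# postgresConfig block is only consulted for databaseType 'postgres'.
databasePath: "/var/jobservice.db"
```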
13 changes: 12 additions & 1 deletion e2e/setup/jobservice.yaml

```diff
@@ -1,6 +1,17 @@
-databasePath: "/tmp/jobservice.db"
 subscribeJobSetTime: 60
 purgeJobSetTime: 10000
+databaseType: "postgres"
+postgresConfig:
+  maxOpenConns: 50
+  maxIdleConns: 10
+  connMaxLifetime: 30m
+  connection:
+    host: postgres
+    port: 5432
+    user: postgres
+    password: psw
+    dbname: postgres
+    sslmode: disable
 apiConnection:
   armadaUrl: "server:50051"
   forceNoTls: true
```
1 change: 0 additions & 1 deletion internal/armada/server/lease.go

```diff
@@ -376,7 +376,6 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL
         for _, job := range jobs {
             nodeIdByJobId[job.Id] = node.Id
         }
-
         nodes = append(nodes, node)
     }
     indexedResources := q.schedulingConfig.IndexedResources
```
9 changes: 4 additions & 5 deletions internal/executor/application.go

```diff
@@ -155,8 +155,8 @@ func setupExecutorApiComponents(
         nil,
         config.Kubernetes.TrackedNodeLabels,
         config.Kubernetes.NodeIdLabel,
-        config.Kubernetes.NodeReservedResources,
-        config.Kubernetes.NodeReservedResourcesPriority,
+        config.Kubernetes.MinimumResourcesMarkedAllocatedToNonArmadaPodsPerNode,
+        config.Kubernetes.MinimumResourcesMarkedAllocatedToNonArmadaPodsPerNodePriority,
     )

     eventReporter, stopReporter := reporter.NewJobEventReporter(
@@ -261,8 +261,8 @@ func setupServerApiComponents(
         usageClient,
         config.Kubernetes.TrackedNodeLabels,
         config.Kubernetes.NodeIdLabel,
-        config.Kubernetes.NodeReservedResources,
-        config.Kubernetes.NodeReservedResourcesPriority,
+        config.Kubernetes.MinimumResourcesMarkedAllocatedToNonArmadaPodsPerNode,
+        config.Kubernetes.MinimumResourcesMarkedAllocatedToNonArmadaPodsPerNodePriority,
     )

     jobLeaseService := service.NewJobLeaseService(
@@ -286,7 +286,6 @@ func setupServerApiComponents(
         clusterUtilisationService,
         submitter,
         etcdHealthMonitor,
-        config.Kubernetes.NodeReservedResources,
     )

     jobManager := service.NewJobManager(
```
17 changes: 9 additions & 8 deletions internal/executor/configuration/types.go

```diff
@@ -55,14 +55,15 @@ type KubernetesConfiguration struct {
     PodDefaults              *PodDefaults
     PendingPodChecks         *podchecks.Checks
     FatalPodSubmissionErrors []string
-    // NodeReservedResources config is used to factor in reserved resources on each node
-    // when validating whether a job can be scheduled on a node during job submit (i.e., to factor in resources for daemonset pods).
-    NodeReservedResources armadaresource.ComputeResources
-    // NodeReservedResourcesPriority is the priority the reserved resources are reported at.
-    // All pods in Kubernetes have a priority, and we report resources to the Armada API per priority,
-    // so we also need to set a priority for the reserved resources.
-    NodeReservedResourcesPriority int32
-    PodKillTimeout time.Duration
+    // Minimum amount of resources marked as allocated to non-Armada pods on each node.
+    // I.e., if the total resources allocated to non-Armada pods on some node drops below this value,
+    // the executor adds a fictional allocation to make up the difference, such that the total is at least this.
+    // Hence, specifying this value can ensure that, e.g., if a daemonset pod restarts, those resources are not considered for scheduling.
+    MinimumResourcesMarkedAllocatedToNonArmadaPodsPerNode armadaresource.ComputeResources
+    // When adding a fictional allocation to ensure resources allocated to non-Armada pods is at least
+    // MinimumResourcesMarkedAllocatedToNonArmadaPodsPerNode, those resources are marked allocated at this priority.
+    MinimumResourcesMarkedAllocatedToNonArmadaPodsPerNodePriority int32
+    PodKillTimeout time.Duration
 }

 type EtcdConfiguration struct {
```
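
The behaviour these comments describe is easy to illustrate with a sketch. The following is not the executor's actual implementation: it substitutes a simplified `Resources` type for `armadaresource.ComputeResources`, and the function name is hypothetical.

```go
package main

import "fmt"

// Resources maps a resource name (e.g. "cpu", "memory") to an amount in
// integer units (millicores for cpu, bytes for memory).
type Resources map[string]int64

// topUpNonArmadaAllocation returns the fictional allocation to report so
// that the total marked as allocated to non-Armada pods on a node is at
// least the configured minimum.
func topUpNonArmadaAllocation(observed, minimum Resources) Resources {
	fictional := Resources{}
	for name, floor := range minimum {
		if got := observed[name]; got < floor {
			// Make up the difference so that observed + fictional == floor.
			fictional[name] = floor - got
		}
	}
	return fictional
}

func main() {
	// Non-Armada pods are currently using only 40 millicores of cpu and no
	// memory, e.g. because a daemonset pod is restarting.
	observed := Resources{"cpu": 40}
	// Configured floor: 100m cpu, 50Mi memory.
	minimum := Resources{"cpu": 100, "memory": 50 * 1024 * 1024}

	// Prints map[cpu:60 memory:52428800]; the executor would report this
	// extra allocation at the configured priority (2000001000, i.e.
	// system-node-critical) so the scheduler does not treat it as free.
	fmt.Println(topUpNonArmadaAllocation(observed, minimum))
}
```

The floor is enforced per resource, so a node whose daemonset pods are briefly absent still reports at least the configured cpu and memory as taken, and jobs are not scheduled into space that will shortly disappear.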
3 changes: 0 additions & 3 deletions internal/executor/service/cluster_allocation.go

```diff
@@ -123,7 +123,6 @@ type LegacyClusterAllocationService struct {
     clusterContext    executorContext.ClusterContext
     submitter         job.Submitter
     etcdHealthMonitor healthmonitor.EtcdLimitHealthMonitor
-    reserved          armadaresource.ComputeResources
 }

 func NewLegacyClusterAllocationService(
@@ -133,7 +132,6 @@ func NewLegacyClusterAllocationService(
     utilisationService utilisation.UtilisationService,
     submitter job.Submitter,
     etcdHealthMonitor healthmonitor.EtcdLimitHealthMonitor,
-    reserved armadaresource.ComputeResources,
 ) *LegacyClusterAllocationService {
     return &LegacyClusterAllocationService{
         leaseService: leaseService,
@@ -142,7 +140,6 @@ func NewLegacyClusterAllocationService(
         clusterContext:    clusterContext,
         submitter:         submitter,
         etcdHealthMonitor: etcdHealthMonitor,
-        reserved:          reserved,
     }
 }
```