Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bugfix: Add image building job deletion delay #345

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion api/turing/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ func (qty *Quantity) MarshalJSON() ([]byte, error) {

type EngineConfig map[string]interface{}

// EnvironmentConfig is a abridged version of
// EnvironmentConfig is an abridged version of
// https://github.dev/caraml-dev/merlin/blob/98ada0d3aa8de30d73e441d3fd1000fe5d5ac266/api/config/environment.go#L26
// Only requires Name and K8sConfig
// This struct should be removed when Environments API is moved from Merlin to MLP
Expand Down
35 changes: 33 additions & 2 deletions api/turing/imagebuilder/imagebuilder.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,12 @@ const (
type JobStatus int

const (
tickDurationInSeconds = 5
// jobDeletionTimeoutInSeconds is the maximum time to wait for a job to be deleted from a cluster
jobDeletionTimeoutInSeconds = 30
// jobDeletionTickDurationInMilliseconds is the interval at which the API server checks if a job has been deleted
jobDeletionTickDurationInMilliseconds = 100
// jobCompletionTickDurationInSeconds is the interval at which the API server checks if a job has completed
jobCompletionTickDurationInSeconds = 5
// JobStatusActive is the status of the image building job is active
JobStatusActive = JobStatus(iota)
// JobStatusFailed is when the image building job has failed
Expand Down Expand Up @@ -164,6 +169,11 @@ func (ib *imageBuilder) BuildImage(request BuildImageRequest) (string, error) {
return "", ErrDeleteFailedJob
}

err = ib.waitForJobToBeDeleted(job)
if err != nil {
return "", ErrDeleteFailedJob
}

job, err = ib.createKanikoJob(kanikoJobName, imageRef, request.ArtifactURI, request.BuildLabels,
request.EnsemblerFolder, request.BaseImageRefTag)
if err != nil {
Expand All @@ -183,7 +193,7 @@ func (ib *imageBuilder) BuildImage(request BuildImageRequest) (string, error) {

func (ib *imageBuilder) waitForJobToFinish(job *apibatchv1.Job) error {
timeout := time.After(ib.imageBuildingConfig.BuildTimeoutDuration)
ticker := time.NewTicker(time.Second * tickDurationInSeconds)
ticker := time.NewTicker(time.Second * jobCompletionTickDurationInSeconds)

for {
select {
Expand All @@ -207,6 +217,27 @@ func (ib *imageBuilder) waitForJobToFinish(job *apibatchv1.Job) error {
}
}

func (ib *imageBuilder) waitForJobToBeDeleted(job *apibatchv1.Job) error {
timeout := time.After(time.Second * jobDeletionTimeoutInSeconds)
ticker := time.NewTicker(time.Millisecond * jobDeletionTickDurationInMilliseconds)

for {
select {
case <-timeout:
return ErrDeleteFailedJob
case <-ticker.C:
_, err := ib.clusterController.GetJob(context.Background(), ib.imageBuildingConfig.BuildNamespace, job.Name)
if err != nil {
if kerrors.IsNotFound(err) {
return nil
}
log.Errorf("unable to get job status for job %s: %v", job.Name, err)
return ErrDeleteFailedJob
}
}
}
}

func (ib *imageBuilder) createKanikoJob(
kanikoJobName string,
imageRef string,
Expand Down
32 changes: 30 additions & 2 deletions api/turing/imagebuilder/imagebuilder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,21 @@ func TestBuildPyFuncEnsemblerJobImage(t *testing.T) {
nil,
).Once()

// Second time it's called
// Second time GetJob is called
ctlr.On(
"GetJob",
mock.Anything,
mock.Anything,
mock.Anything,
).Return(
nil,
k8serrors.NewNotFound(
schema.GroupResource{},
fmt.Sprintf("batch-%s-%s-%d-%s", projectName, modelName, modelVersion, runID[:5]),
),
).Once()

// Third time it's called
ctlr.On(
"GetJob",
mock.Anything,
Expand Down Expand Up @@ -492,7 +506,21 @@ func TestBuildPyFuncEnsemblerServiceImage(t *testing.T) {
nil,
).Once()

// Second time it's called
// Second time GetJob is called
ctlr.On(
"GetJob",
mock.Anything,
mock.Anything,
mock.Anything,
).Return(
nil,
k8serrors.NewNotFound(
schema.GroupResource{},
fmt.Sprintf("service-builder-%s-%s-%d-%s", projectName, modelName, modelVersion, runID[:5]),
),
).Once()

// Third time it's called
ctlr.On(
"GetJob",
mock.Anything,
Expand Down