Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import org.apache.flink.kubernetes.operator.controller.FlinkResourceContext;
import org.apache.flink.kubernetes.operator.metrics.KubernetesResourceMetricGroup;
import org.apache.flink.kubernetes.operator.reconciler.deployment.JobAutoScaler;
import org.apache.flink.kubernetes.operator.utils.EventRecorder;
import org.apache.flink.runtime.jobgraph.JobVertexID;

import io.fabric8.kubernetes.client.KubernetesClient;
Expand All @@ -48,6 +49,7 @@ public class JobAutoScalerImpl implements JobAutoScaler {
private final ScalingMetricCollector metricsCollector;
private final ScalingMetricEvaluator evaluator;
private final ScalingExecutor scalingExecutor;
private final EventRecorder eventRecorder;

private final Map<ResourceID, Map<JobVertexID, Map<ScalingMetric, EvaluatedScalingMetric>>>
lastEvaluatedMetrics = new ConcurrentHashMap<>();
Expand All @@ -57,13 +59,13 @@ public JobAutoScalerImpl(
KubernetesClient kubernetesClient,
ScalingMetricCollector metricsCollector,
ScalingMetricEvaluator evaluator,
ScalingExecutor scalingExecutor) {

ScalingExecutor scalingExecutor,
EventRecorder eventRecorder) {
this.kubernetesClient = kubernetesClient;

this.metricsCollector = metricsCollector;
this.evaluator = evaluator;
this.scalingExecutor = scalingExecutor;
this.eventRecorder = eventRecorder;
}

@Override
Expand All @@ -81,17 +83,17 @@ public boolean scale(FlinkResourceContext<? extends AbstractFlinkResource<?, ?>>
var conf = ctx.getObserveConfig();
var resource = ctx.getResource();

if (resource.getSpec().getJob() == null || !conf.getBoolean(AUTOSCALER_ENABLED)) {
LOG.info("Job autoscaler is disabled");
return false;
}
try {
if (resource.getSpec().getJob() == null || !conf.getBoolean(AUTOSCALER_ENABLED)) {
LOG.info("Job autoscaler is disabled");
return false;
}

if (!resource.getStatus().getJobStatus().getState().equals(JobStatus.RUNNING.name())) {
LOG.info("Job autoscaler is waiting for RUNNING job state");
return false;
}
if (!resource.getStatus().getJobStatus().getState().equals(JobStatus.RUNNING.name())) {
LOG.info("Job autoscaler is waiting for RUNNING job state");
return false;
}

try {
var autoScalerInfo = AutoScalerInfo.forResource(resource, kubernetesClient);

var collectedMetrics =
Expand All @@ -113,8 +115,14 @@ public boolean scale(FlinkResourceContext<? extends AbstractFlinkResource<?, ?>>
scalingExecutor.scaleResource(resource, autoScalerInfo, conf, evaluatedMetrics);
autoScalerInfo.replaceInKubernetes(kubernetesClient);
return specAdjusted;
} catch (Exception e) {
} catch (Throwable e) {
LOG.error("Error while scaling resource", e);
eventRecorder.triggerEvent(
resource,
EventRecorder.Type.Warning,
EventRecorder.Reason.AutoscalerError,
EventRecorder.Component.Operator,
e.getMessage());
return false;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ public JobAutoScaler create(KubernetesClient kubernetesClient, EventRecorder eve
kubernetesClient,
new RestApiMetricsCollector(),
new ScalingMetricEvaluator(),
new ScalingExecutor(kubernetesClient, eventRecorder));
new ScalingExecutor(kubernetesClient, eventRecorder),
eventRecorder);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ public static void computeDataRateMetrics(
scalingMetrics.put(ScalingMetric.CURRENT_PROCESSING_RATE, numRecordsInPerSecond);
} else {
LOG.error("Cannot compute true processing rate without numRecordsInPerSecond");
scalingMetrics.put(ScalingMetric.TRUE_PROCESSING_RATE, Double.NaN);
scalingMetrics.put(ScalingMetric.CURRENT_PROCESSING_RATE, Double.NaN);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;

/** Test for scaling metrics collection logic. */
@EnableKubernetesMockClient(crud = true)
Expand All @@ -68,6 +69,8 @@ public class BacklogBasedScalingTest extends OperatorTestBase {

private JobAutoScalerImpl autoscaler;

private EventCollector eventCollector = new EventCollector();

@BeforeEach
public void setup() {
evaluator = new ScalingMetricEvaluator();
Expand Down Expand Up @@ -108,7 +111,11 @@ public void setup() {

autoscaler =
new JobAutoScalerImpl(
kubernetesClient, metricsCollector, evaluator, scalingExecutor);
kubernetesClient,
metricsCollector,
evaluator,
scalingExecutor,
new EventRecorder(kubernetesClient, eventCollector));

// Reset custom window size to default
metricsCollector.setTestMetricWindowSize(null);
Expand Down Expand Up @@ -373,6 +380,20 @@ public void testMetricsPersistedAfterRedeploy() {
assertFalse(AutoScalerInfo.forResource(app, kubernetesClient).getMetricHistory().isEmpty());
}

@Test
public void testEventOnError() {
// Invalid config
app.getSpec()
.getFlinkConfiguration()
.put("kubernetes.operator.job.autoscaler.enabled", "3");
autoscaler.scale(getResourceContext(app, createAutoscalerTestContext()));

var event = eventCollector.events.poll();
assertTrue(eventCollector.events.isEmpty());
assertEquals(EventRecorder.Reason.AutoscalerError.toString(), event.getReason());
assertTrue(event.getMessage().startsWith("Could not parse"));
}

private void redeployJob(Instant now) {
app.getStatus().getJobStatus().setUpdateTime(String.valueOf(now.toEpochMilli()));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ public enum Reason {
RecoverDeployment,
RestartUnhealthyJob,
ScalingReport,
IneffectiveScaling
IneffectiveScaling,
AutoscalerError
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,16 @@ public void accept(Event event) {
Assertions.assertNotNull(event);
Assertions.assertEquals(eventConsumed, event);
Assertions.assertEquals(2, event.getCount());

Assertions.assertTrue(
EventUtils.createOrUpdateEvent(
kubernetesClient,
flinkApp,
EventRecorder.Type.Warning,
reason,
null,
EventRecorder.Component.Operator,
consumer));
}

@Test
Expand Down