Skip to content

Commit

Permalink
[Task Manager] Add config switch around logging at different levels b…
Browse files Browse the repository at this point in the history
…ased on the state (elastic#102804)

* Gate behind a config with warning message that helps users enable

* Update more files

* Fix docs formatting

* Preserve existing functionality

* Add in task type to the message

* Show multiple alert types that are over the threshold

Co-authored-by: Kibana Machine <42973632+kibanamachine@users.noreply.github.com>
  • Loading branch information
chrisronline and kibanamachine committed Jun 24, 2021
1 parent 5a76c84 commit dd072c3
Show file tree
Hide file tree
Showing 13 changed files with 305 additions and 42 deletions.
8 changes: 7 additions & 1 deletion docs/settings/task-manager-settings.asciidoc
Expand Up @@ -29,7 +29,13 @@ Task Manager runs background tasks by polling for work on an interval. You can
| The maximum number of tasks that this Kibana instance will run simultaneously. Defaults to 10.
Starting in 8.0, it will not be possible to set the value greater than 100.

| `xpack.task_manager.monitored_stats_warn_delayed_task_start_in_seconds`
| `xpack.task_manager.`
`monitored_stats_health_verbose_log.enabled`
| This flag will enable automatic warn and error logging if task manager self detects a performance issue, such as the time between when a task is scheduled to execute and when it actually executes. Defaults to false.

| `xpack.task_manager.`
`monitored_stats_health_verbose_log.`
`warn_delayed_task_start_in_seconds`
| The amount of seconds we allow a task to delay before printing a warning server log. Defaults to 60.
|===

Expand Down
Expand Up @@ -379,7 +379,8 @@ kibana_vars=(
xpack.task_manager.monitored_aggregated_stats_refresh_rate
xpack.task_manager.monitored_stats_required_freshness
xpack.task_manager.monitored_stats_running_average_window
xpack.task_manager.monitored_stats_warn_delayed_task_start_in_seconds
xpack.task_manager.monitored_stats_health_verbose_log.enabled
xpack.task_manager.monitored_stats_health_verbose_log.warn_delayed_task_start_in_seconds
xpack.task_manager.monitored_task_execution_thresholds
xpack.task_manager.poll_interval
xpack.task_manager.request_capacity
Expand Down
15 changes: 12 additions & 3 deletions x-pack/plugins/task_manager/server/config.test.ts
Expand Up @@ -18,9 +18,12 @@ describe('config validation', () => {
"max_poll_inactivity_cycles": 10,
"max_workers": 10,
"monitored_aggregated_stats_refresh_rate": 60000,
"monitored_stats_health_verbose_log": Object {
"enabled": false,
"warn_delayed_task_start_in_seconds": 60,
},
"monitored_stats_required_freshness": 4000,
"monitored_stats_running_average_window": 50,
"monitored_stats_warn_delayed_task_start_in_seconds": 60,
"monitored_task_execution_thresholds": Object {
"custom": Object {},
"default": Object {
Expand Down Expand Up @@ -67,9 +70,12 @@ describe('config validation', () => {
"max_poll_inactivity_cycles": 10,
"max_workers": 10,
"monitored_aggregated_stats_refresh_rate": 60000,
"monitored_stats_health_verbose_log": Object {
"enabled": false,
"warn_delayed_task_start_in_seconds": 60,
},
"monitored_stats_required_freshness": 4000,
"monitored_stats_running_average_window": 50,
"monitored_stats_warn_delayed_task_start_in_seconds": 60,
"monitored_task_execution_thresholds": Object {
"custom": Object {},
"default": Object {
Expand Down Expand Up @@ -103,9 +109,12 @@ describe('config validation', () => {
"max_poll_inactivity_cycles": 10,
"max_workers": 10,
"monitored_aggregated_stats_refresh_rate": 60000,
"monitored_stats_health_verbose_log": Object {
"enabled": false,
"warn_delayed_task_start_in_seconds": 60,
},
"monitored_stats_required_freshness": 4000,
"monitored_stats_running_average_window": 50,
"monitored_stats_warn_delayed_task_start_in_seconds": 60,
"monitored_task_execution_thresholds": Object {
"custom": Object {
"alerting:always-fires": Object {
Expand Down
9 changes: 6 additions & 3 deletions x-pack/plugins/task_manager/server/config.ts
Expand Up @@ -110,9 +110,12 @@ export const configSchema = schema.object(
defaultValue: {},
}),
}),
/* The amount of seconds we allow a task to delay before printing a warning server log */
monitored_stats_warn_delayed_task_start_in_seconds: schema.number({
defaultValue: DEFAULT_MONITORING_STATS_WARN_DELAYED_TASK_START_IN_SECONDS,
monitored_stats_health_verbose_log: schema.object({
enabled: schema.boolean({ defaultValue: false }),
/* The amount of seconds we allow a task to delay before printing a warning server log */
warn_delayed_task_start_in_seconds: schema.number({
defaultValue: DEFAULT_MONITORING_STATS_WARN_DELAYED_TASK_START_IN_SECONDS,
}),
}),
},
{
Expand Down
Expand Up @@ -37,7 +37,10 @@ describe('managed configuration', () => {
version_conflict_threshold: 80,
max_poll_inactivity_cycles: 10,
monitored_aggregated_stats_refresh_rate: 60000,
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_health_verbose_log: {
enabled: false,
warn_delayed_task_start_in_seconds: 60,
},
monitored_stats_required_freshness: 4000,
monitored_stats_running_average_window: 50,
request_capacity: 1000,
Expand Down
191 changes: 181 additions & 10 deletions x-pack/plugins/task_manager/server/lib/log_health_metrics.test.ts
Expand Up @@ -10,7 +10,7 @@ import { configSchema, TaskManagerConfig } from '../config';
import { HealthStatus } from '../monitoring';
import { TaskPersistence } from '../monitoring/task_run_statistics';
import { MonitoredHealth } from '../routes/health';
import { logHealthMetrics } from './log_health_metrics';
import { logHealthMetrics, resetLastLogLevel } from './log_health_metrics';
import { Logger } from '../../../../../src/core/server';

jest.mock('./calculate_health_status', () => ({
Expand All @@ -20,12 +20,110 @@ jest.mock('./calculate_health_status', () => ({
describe('logHealthMetrics', () => {
afterEach(() => {
const { calculateHealthStatus } = jest.requireMock('./calculate_health_status');
// Reset the last state by running through this as OK
// (calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.OK);
resetLastLogLevel();
(calculateHealthStatus as jest.Mock<HealthStatus>).mockReset();
});

it('should log a warning message to enable verbose logging when the status goes from OK to Warning/Error', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_health_verbose_log: {
enabled: false,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth();
const { calculateHealthStatus } = jest.requireMock('./calculate_health_status');

// We must change from OK to Warning
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.OK);
logHealthMetrics(health, logger, config);
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(
() => HealthStatus.Warning
);
logHealthMetrics(health, logger, config);
// We must change from OK to Error
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.OK);
logHealthMetrics(health, logger, config);
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.Error);
logHealthMetrics(health, logger, config);

expect((logger as jest.Mocked<Logger>).warn.mock.calls[0][0] as string).toBe(
`Detected potential performance issue with Task Manager. Set 'xpack.task_manager.monitored_stats_health_verbose_log.enabled: true' in your Kibana.yml to enable debug logging`
);
expect((logger as jest.Mocked<Logger>).warn.mock.calls[1][0] as string).toBe(
`Detected potential performance issue with Task Manager. Set 'xpack.task_manager.monitored_stats_health_verbose_log.enabled: true' in your Kibana.yml to enable debug logging`
);
});

it('should not log a warning message to enable verbose logging when the status goes from Warning to OK', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_health_verbose_log: {
enabled: false,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth();
const { calculateHealthStatus } = jest.requireMock('./calculate_health_status');

// We must change from Warning to OK
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(
() => HealthStatus.Warning
);
logHealthMetrics(health, logger, config);
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.OK);
logHealthMetrics(health, logger, config);
expect((logger as jest.Mocked<Logger>).warn).not.toHaveBeenCalled();
});

it('should not log a warning message to enable verbose logging when the status goes from Error to OK', () => {
// console.log('start', getLastLogLevel());
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_health_verbose_log: {
enabled: false,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth();
const { calculateHealthStatus } = jest.requireMock('./calculate_health_status');

// We must change from Error to OK
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.Error);
logHealthMetrics(health, logger, config);
(calculateHealthStatus as jest.Mock<HealthStatus>).mockImplementation(() => HealthStatus.OK);
logHealthMetrics(health, logger, config);
expect((logger as jest.Mocked<Logger>).warn).not.toHaveBeenCalled();
});

it('should log as debug if status is OK', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_health_verbose_log: {
enabled: true,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth();

logHealthMetrics(health, logger, config);

const firstDebug = JSON.parse(
(logger as jest.Mocked<Logger>).debug.mock.calls[0][0].replace('Latest Monitored Stats: ', '')
);
expect(firstDebug).toMatchObject(health);
});

it('should log as debug if status is OK even if not enabled', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_health_verbose_log: {
enabled: false,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth();

Expand All @@ -40,7 +138,10 @@ describe('logHealthMetrics', () => {
it('should log as warn if status is Warn', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_health_verbose_log: {
enabled: true,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth();
const { calculateHealthStatus } = jest.requireMock('./calculate_health_status');
Expand All @@ -62,7 +163,10 @@ describe('logHealthMetrics', () => {
it('should log as error if status is Error', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_health_verbose_log: {
enabled: true,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth();
const { calculateHealthStatus } = jest.requireMock('./calculate_health_status');
Expand All @@ -79,15 +183,26 @@ describe('logHealthMetrics', () => {
expect(logMessage).toMatchObject(health);
});

it('should log as warn if drift exceeds the threshold', () => {
it('should log as warn if drift exceeds the threshold for a single alert type', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_health_verbose_log: {
enabled: true,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth({
stats: {
runtime: {
value: {
drift_by_type: {
'taskType:test': {
p99: 60000,
},
'taskType:test2': {
p99: 60000 - 1,
},
},
drift: {
p99: 60000,
},
Expand All @@ -99,7 +214,50 @@ describe('logHealthMetrics', () => {
logHealthMetrics(health, logger, config);

expect((logger as jest.Mocked<Logger>).warn.mock.calls[0][0] as string).toBe(
`Detected delay task start of 60s (which exceeds configured value of 60s)`
`Detected delay task start of 60s for task(s) \"taskType:test\" (which exceeds configured value of 60s)`
);

const secondMessage = JSON.parse(
((logger as jest.Mocked<Logger>).warn.mock.calls[1][0] as string).replace(
`Latest Monitored Stats: `,
''
)
);
expect(secondMessage).toMatchObject(health);
});

it('should log as warn if drift exceeds the threshold for multiple alert types', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_health_verbose_log: {
enabled: true,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth({
stats: {
runtime: {
value: {
drift_by_type: {
'taskType:test': {
p99: 60000,
},
'taskType:test2': {
p99: 60000,
},
},
drift: {
p99: 60000,
},
},
},
},
});

logHealthMetrics(health, logger, config);

expect((logger as jest.Mocked<Logger>).warn.mock.calls[0][0] as string).toBe(
`Detected delay task start of 60s for task(s) \"taskType:test, taskType:test2\" (which exceeds configured value of 60s)`
);

const secondMessage = JSON.parse(
Expand All @@ -114,7 +272,10 @@ describe('logHealthMetrics', () => {
it('should log as debug if there are no stats', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_health_verbose_log: {
enabled: true,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = {
id: '1',
Expand All @@ -135,7 +296,10 @@ describe('logHealthMetrics', () => {
it('should ignore capacity estimation status', () => {
const logger = loggingSystemMock.create().get();
const config = getTaskManagerConfig({
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_health_verbose_log: {
enabled: true,
warn_delayed_task_start_in_seconds: 60,
},
});
const health = getMockMonitoredHealth({
stats: {
Expand Down Expand Up @@ -213,7 +377,14 @@ function getMockMonitoredHealth(overrides = {}): MonitoredHealth {
p95: 2500,
p99: 3000,
},
drift_by_type: {},
drift_by_type: {
'taskType:test': {
p50: 1000,
p90: 2000,
p95: 2500,
p99: 3000,
},
},
load: {
p50: 1000,
p90: 2000,
Expand Down

0 comments on commit dd072c3

Please sign in to comment.