Skip to content

Commit

Permalink
Spawn multiple checks for failing tasks #169
Browse files Browse the repository at this point in the history
  • Loading branch information
bwalkerl committed Feb 26, 2024
1 parent e2be806 commit 36f2696
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 196 deletions.
120 changes: 60 additions & 60 deletions classes/check/failingtaskcheck.php
Expand Up @@ -16,6 +16,7 @@

namespace tool_heartbeat\check;

use core\check\check;
use core\check\result;

/**
Expand All @@ -28,20 +29,24 @@
* @copyright 2023 Matthew Hilton <matthewhilton@catalyst-au.net>
* @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
*/
class failingtaskcheck extends overridable_check {
class failingtaskcheck extends check {

/** @var int $warnthreshold Threshold in minutes of fail delay above which a failing task raises a warning **/
public $warnthreshold = 1;
public $warnthreshold = 60;

/** @var int $errorthreshold Threshold in minutes of fail delay above which a failing task raises an error **/
public $errorthreshold = 600;

/** @var \stdClass $task Record of task that is failing **/
private $task;

/**
* Constructor
*/
public function __construct() {
public function __construct($task = null) {
$this->id = 'cronfailingtasks';
$this->name = get_string('checkfailingtaskcheck', 'tool_heartbeat');
$this->task = $task;

$this->actionlink = new \action_link(
new \moodle_url('/admin/tasklogs.php'),
Expand All @@ -55,82 +60,77 @@ public function __construct() {
public function get_result() : result {
global $DB;

// Instead of using task API here, we read directly from the database.
// This stops errors originating from broken tasks.
$scheduledtasks = $DB->get_records_sql("SELECT * FROM {task_scheduled} WHERE faildelay > 0 AND disabled = 0");

foreach ($scheduledtasks as $task) {
$message = "SCHEDULED TASK: {$task->classname} Delay: {$task->faildelay}\n";
$this->add_failing_task($task->classname, $message);
}

// Instead of using task API here, we read directly from the database.
// This stops errors originating from broken tasks, and allows the DB to de-duplicate them.
$adhoctasks = $DB->get_records_sql(" SELECT classname, COUNT(*) count, MAX(faildelay) faildelay, SUM(faildelay) cfaildelay
FROM {task_adhoc}
WHERE faildelay > 0
GROUP BY classname
ORDER BY cfaildelay DESC");

foreach ($adhoctasks as $record) {
// Only add duplicate message if there are more than 1.
$duplicatemsg = $record->count > 1 ? " ({$record->count} duplicates!!!)" : '';
$message = "ADHOC TASK: {$record->classname} Delay: {$record->faildelay} {$duplicatemsg}\n";
$this->add_failing_task($record->classname, $message);
}

$this->triage_failing_tasks();

$taskoutputs = $this->get_overridden_outputs();

// Return OK if no task errors.
$notaskerrors = count($taskoutputs) == 0;
if ($notaskerrors) {
return new result(result::OK, '', '');
if (!isset($this->task)) {
$count = $DB->count_records_sql("SELECT COUNT(*) FROM {task_scheduled} WHERE faildelay = 0 AND disabled = 0");
return new result(result::OK, get_string('checkfailingtaskok', 'tool_heartbeat', $count), '');
}

// Find the largest faildelay out of both adhoc and scheduled tasks.
$alldelays = array_merge(array_column($adhoctasks, 'faildelay'), array_column($scheduledtasks, 'faildelay'));
$maxdelaymins = !empty($alldelays) ? max($alldelays) / 60 : 0;

// Define a simple function to work out what the message should be based on the task outputs.
// Returns the [$summary, $details].
$taskoutputfn = function($faildelaymins) use ($taskoutputs) {
$count = count($taskoutputs);

if ($count == 1) {
// Only a single task is failing, so put it at the top level.
return [$taskoutputs[0], ''];
}

if ($count > 1) {
// More than 1, add a message at the start that indicates how many.
return ["{$count} Moodle tasks reported errors, maximum faildelay > {$faildelaymins} mins", implode("", $taskoutputs)];
}

// There are 0 tasks failing, default to nothing.
return ['', ''];
};
$maxdelaymins = !empty($this->task->faildelay) ? $this->task->faildelay / 60 : 0;

// Default to ok.
$status = result::OK;
$delay = 0;

// Check if warn - if so then upgrade to warn.
if ($maxdelaymins > $this->warnthreshold) {
$status = result::WARNING;
$delay = $this->warnthreshold;
}

// Check if error - if so then upgrade to error.
if ($maxdelaymins > $this->errorthreshold) {
$status = result::ERROR;
$delay = $this->errorthreshold;
}

list($summary, $details) = $taskoutputfn($delay);
return new result($status, $this->task->message, '');
}

/**
 * Returns the reference used to identify this check.
 *
 * When the check wraps a specific failing task, the reference is built from
 * the task classname so each task gets its own entry. Otherwise the parent
 * implementation is used.
 *
 * @return string must be globally unique
 */
public function get_ref(): string {
    if (isset($this->task)) {
        // Namespace separators are swapped for underscores so the value is usable as a query param,
        // then any leading/trailing underscore (e.g. from a leading backslash) is removed.
        $ref = str_replace('\\', '_', $this->task->classname);
        return trim($ref, '_');
    }

    // No task attached: keep the default reference from the parent class.
    return parent::get_ref();
}

return new result($status, nl2br($summary), nl2br($details));
/**
 * Builds one check instance per failing task.
 *
 * Scheduled tasks are included when they have a fail delay and are not disabled.
 * Adhoc tasks with a fail delay are grouped by classname, so each classname
 * produces a single check. Every record gets a human readable 'message'
 * property before it is wrapped in a check.
 *
 * @return array list of failingtaskcheck instances, scheduled tasks first, then adhoc tasks
 */
public static function get_failing_tasks(): array {
    global $DB;

    $checks = [];

    // The task API is deliberately bypassed in favour of direct DB reads,
    // so that a broken task cannot raise errors while it is being loaded.
    $failingscheduled = $DB->get_records_sql("SELECT * FROM {task_scheduled} WHERE faildelay > 0 AND disabled = 0");
    foreach ($failingscheduled as $scheduled) {
        $scheduled->message = "SCHEDULED TASK: {$scheduled->classname} Delay: {$scheduled->faildelay}\n";
        $checks[] = new self($scheduled);
    }

    // Same direct DB approach for adhoc tasks; grouping by classname lets the DB de-duplicate them.
    $failingadhoc = $DB->get_records_sql(" SELECT classname, COUNT(*) count, MAX(faildelay) faildelay, SUM(faildelay) cfaildelay
FROM {task_adhoc}
WHERE faildelay > 0
GROUP BY classname
ORDER BY cfaildelay DESC");
    foreach ($failingadhoc as $adhoc) {
        // The duplicate note is only appended when the classname occurs more than once.
        $duplicatenote = '';
        if ($adhoc->count > 1) {
            $duplicatenote = " ({$adhoc->count} duplicates!!!)";
        }
        $adhoc->message = "ADHOC TASK: {$adhoc->classname} Delay: {$adhoc->faildelay} {$duplicatenote}\n";
        $checks[] = new self($adhoc);
    }

    return $checks;
}
}
135 changes: 0 additions & 135 deletions classes/check/overridable_check.php

This file was deleted.

1 change: 1 addition & 0 deletions lang/en/tool_heartbeat.php
Expand Up @@ -86,6 +86,7 @@
$string['taskconfigbad'] = 'Bad configurations {$a}';
$string['tasklatencyok'] = 'Task latency OK.';
$string['checkfailingtaskcheck'] = 'Failing tasks';
$string['checkfailingtaskok'] = '{$a} tasks running OK.';
$string['checkdirsizes'] = 'CFG->dataroot size';
$string['mute'] = 'Mute';
$string['unmute'] = 'Unmute';
Expand Down
4 changes: 3 additions & 1 deletion lib.php
Expand Up @@ -36,13 +36,15 @@ function tool_heartbeat_before_http_headers() {
* @return array
*/
function tool_heartbeat_status_checks() {
return [
$checks = [
new \tool_heartbeat\check\authcheck(),
new \tool_heartbeat\check\cachecheck(),
new \tool_heartbeat\check\logstorecheck(),
new \tool_heartbeat\check\tasklatencycheck(),
new \tool_heartbeat\check\failingtaskcheck(),
];
$taskchecks = \tool_heartbeat\check\failingtaskcheck::get_failing_tasks();
return array_merge($checks, $taskchecks);
}

/**
Expand Down

0 comments on commit 36f2696

Please sign in to comment.