Update Terraform
Austin Byers committed Apr 14, 2018
1 parent: c3235ba · commit: 1a00175
Showing 9 changed files with 377 additions and 188 deletions.
95 changes: 81 additions & 14 deletions terraform/cloudwatch_dashboard.tf
@@ -61,46 +61,46 @@ EOF
}
EOF

sqs = <<EOF
sqs_analyzer = <<EOF
{
"type": "metric",
"width": 12,
"properties": {
"title": "SQS: ${aws_sqs_queue.s3_object_queue.name}",
"title": "SQS: ${aws_sqs_queue.analyzer_queue.name}",
"region": "${var.aws_region}",
"stat": "Sum",
"metrics": [
["AWS/SQS", "NumberOfMessagesSent", "QueueName", "${aws_sqs_queue.s3_object_queue.name}"],
["AWS/SQS", "NumberOfMessagesSent", "QueueName", "${aws_sqs_queue.analyzer_queue.name}"],
[".", "NumberOfMessagesReceived", ".", "."],
[".", "ApproximateNumberOfMessagesVisible", ".", ".", {"stat": "Average"}]
]
}
}
EOF

sqs_age = <<EOF
sqs_analyzer_age = <<EOF
{
"type": "metric",
"width": 12,
"properties": {
"title": "SQS Age Of Oldest Message (Seconds)",
"title": "Analyzer SQS - Age Of Oldest Message (Seconds)",
"region": "${var.aws_region}",
"stat": "Average",
"metrics": [
[
"AWS/SQS", "ApproximateAgeOfOldestMessage",
"QueueName", "${aws_sqs_queue.s3_object_queue.name}"
"QueueName", "${aws_sqs_queue.analyzer_queue.name}"
]
],
"annotations": {
"horizontal": [
{
"label": "Max",
"value": ${aws_sqs_queue.s3_object_queue.message_retention_seconds}
"value": ${aws_sqs_queue.analyzer_queue.message_retention_seconds}
},
{
"label": "Alarm",
"value": ${aws_cloudwatch_metric_alarm.sqs_age.threshold}
"value": ${aws_cloudwatch_metric_alarm.analyzer_sqs_age.threshold}
}
]
}
@@ -109,9 +109,57 @@ EOF
EOF

// Due to https://github.com/hashicorp/terraform/issues/11574, both ternary branches are always
// computed. This means we have to build the downloader name explicitly instead of referencing
// the (possibly non-existent) downloader module.
downloader_function_name = "${var.name_prefix}_binaryalert_downloader"
// computed, so we have to use this special idiom (same as modules/lambda/outputs.tf).
downloader_function_name = "${module.binaryalert_downloader.function_name}"

downloader_queue_name = "${element(concat(aws_sqs_queue.downloader_queue.*.name, list("")), 0)}"
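
A minimal, self-contained sketch (not part of this commit) of the count-workaround idiom referenced in the comment above, using a hypothetical optional queue: concat() pads the splat list with an empty string so that element(..., 0) still resolves when the conditional resource has count = 0.

variable "enabled" {
  default = 0
}

resource "aws_sqs_queue" "optional_queue" {
  // Hypothetical conditional resource: only created when enabled = 1.
  count = "${var.enabled}"
  name  = "example_optional_queue"
}

locals {
  // When count = 0 the splat list is empty and element() alone would fail;
  // the empty-string fallback added by concat() keeps the expression valid.
  optional_queue_name = "${element(concat(aws_sqs_queue.optional_queue.*.name, list("")), 0)}"
}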

sqs_downloader = <<EOF
{
"type": "metric",
"width": 12,
"properties": {
"title": "SQS: ${local.downloader_queue_name}",
"region": "${var.aws_region}",
"stat": "Sum",
"metrics": [
["AWS/SQS", "NumberOfMessagesSent", "QueueName", "${local.downloader_queue_name}"],
[".", "NumberOfMessagesReceived", ".", "."],
[".", "ApproximateNumberOfMessagesVisible", ".", ".", {"stat": "Average"}]
]
}
}
EOF

sqs_downloader_age = <<EOF
{
"type": "metric",
"width": 12,
"properties": {
"title": "Downloader SQS - Age Of Oldest Message (Seconds)",
"region": "${var.aws_region}",
"stat": "Average",
"metrics": [
[
"AWS/SQS", "ApproximateAgeOfOldestMessage",
"QueueName", "${local.downloader_queue_name}"
]
],
"annotations": {
"horizontal": [
{
"label": "Max",
"value": "${element(concat(aws_sqs_queue.downloader_queue.*.message_retention_seconds, list("")), 0)}"
},
{
"label": "Alarm",
"value": "${element(concat(aws_cloudwatch_metric_alarm.downloader_sqs_age.*.threshold, list("")), 0)}"
}
]
}
}
}
EOF

downloader = <<EOF
,[".", ".", ".", "${local.downloader_function_name}", {"label": "Downloader"}]
@@ -275,21 +323,40 @@ EOF
}
EOF

dashboard_body = <<EOF
dashboard_body_without_downloader = <<EOF
{
"widgets": [
${local.s3_bucket_stats}, ${local.yara_rules},
${local.analyzed_binaries}, ${local.sns_publications},
${local.sqs}, ${local.sqs_age},
${local.sqs_analyzer}, ${local.sqs_analyzer_age},
${local.lambda_invocations}, ${local.max_lambda_duration},
${local.lambda_errors}, ${local.lambda_throttles},
${local.s3_download_latency}, ${local.log_bytes}
]
}
EOF

dashboard_body_with_downloader = <<EOF
{
"widgets": [
${local.s3_bucket_stats}, ${local.yara_rules},
${local.analyzed_binaries}, ${local.sns_publications},
${local.sqs_analyzer}, ${local.sqs_analyzer_age},
${local.sqs_downloader}, ${local.sqs_downloader_age},
${local.lambda_invocations}, ${local.max_lambda_duration},
${local.lambda_errors}, ${local.lambda_throttles},
${local.s3_download_latency}, ${local.log_bytes}
]
}
EOF

dashboard_body = "${var.enable_carbon_black_downloader == 1 ? local.dashboard_body_with_downloader : local.dashboard_body_without_downloader}"
}

resource "aws_cloudwatch_dashboard" "binaryalert" {
dashboard_name = "BinaryAlert"
dashboard_body = "${local.dashboard_body}"

// Terraform automatically converts numbers to strings when putting them in a list.
// We have to strip quotes around numbers, so that {"value": "123"} turns into {"value": 123}
dashboard_body = "${replace(local.dashboard_body, "/\"([0-9]+)\"/", "$1")}"
}
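
A small illustration (assumed input, not part of this commit) of the quote-stripping replace() above. Because Terraform 0.11 renders list and ternary results as strings, the assembled dashboard JSON can contain entries like {"value": "1800"}; wrapping the search pattern in forward slashes makes replace() treat it as a regex, and "$1" substitutes the captured digits without the surrounding quotes.

locals {
  // Hypothetical fragment: "{\"value\": \"1800\"}" becomes "{\"value\": 1800}".
  quote_strip_example = "${replace("{\"value\": \"1800\"}", "/\"([0-9]+)\"/", "$1")}"
}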
65 changes: 59 additions & 6 deletions terraform/cloudwatch_metric_alarm.tf
@@ -6,7 +6,7 @@ resource "aws_cloudwatch_metric_alarm" "batch_enqueue_errors" {

alarm_description = <<EOF
${module.binaryalert_batcher.function_name} failed to enqueue one or more S3 keys into the SQS queue
${aws_sqs_queue.s3_object_queue.arn}.
${aws_sqs_queue.analyzer_queue.arn}.
- Check the batcher CloudWatch logs.
- SQS may be down.
- Once the problem has been resolved, re-execute the batcher (`manage.py analyze_all`) to analyze
@@ -47,12 +47,12 @@ EOF
insufficient_data_actions = ["${aws_sns_topic.metric_alarms.arn}"]
}

// The SQS queue is falling behind.
resource "aws_cloudwatch_metric_alarm" "sqs_age" {
alarm_name = "${aws_sqs_queue.s3_object_queue.name}_old_age"
// The analyzer SQS queue is falling behind.
resource "aws_cloudwatch_metric_alarm" "analyzer_sqs_age" {
alarm_name = "${aws_sqs_queue.analyzer_queue.name}_old_age"

alarm_description = <<EOF
The queue ${aws_sqs_queue.s3_object_queue.name} is falling behind and items are growing old.
The queue ${aws_sqs_queue.analyzer_queue.name} is falling behind and items are growing old.
This can sometimes happen during a batch analysis of the entire bucket (e.g. after a deploy).
- If the SQS age is growing unbounded ("up and to the right"), either the analyzers are down or
they are unable to pull from SQS. Check the analyzer logs.
@@ -65,7 +65,7 @@ EOF
statistic = "Minimum"

dimensions = {
QueueName = "${aws_sqs_queue.s3_object_queue.name}"
QueueName = "${aws_sqs_queue.analyzer_queue.name}"
}

// The queue is consistently more than 30 minutes behind.
@@ -77,6 +77,59 @@ EOF
insufficient_data_actions = ["${aws_sns_topic.metric_alarms.arn}"]
}

// The downloader SQS queue is falling behind.
resource "aws_cloudwatch_metric_alarm" "downloader_sqs_age" {
count = "${var.enable_carbon_black_downloader}"
alarm_name = "${aws_sqs_queue.downloader_queue.name}_old_age"

alarm_description = <<EOF
The queue ${aws_sqs_queue.downloader_queue.name} is falling behind and items are growing old.
Make sure the dispatcher is invoking the downloader, and that the downloader is running correctly.
EOF

namespace = "AWS/SQS"
metric_name = "ApproximateAgeOfOldestMessage"
statistic = "Minimum"

dimensions = {
QueueName = "${aws_sqs_queue.downloader_queue.name}"
}

// The queue is consistently more than 30 minutes behind.
comparison_operator = "GreaterThanThreshold"
threshold = 1800
period = 60
evaluation_periods = 15
alarm_actions = ["${aws_sns_topic.metric_alarms.arn}"]
insufficient_data_actions = ["${aws_sns_topic.metric_alarms.arn}"]
}

// A message was delivered to the dead letter queue (this only happens from the downloader).
resource "aws_cloudwatch_metric_alarm" "dlq_message_received" {
count = "${var.enable_carbon_black_downloader}"
alarm_name = "${aws_sqs_queue.dead_letter_queue.name}_message_received"

alarm_description = <<EOF
An SQS message permanently failed to be processed by the downloader and was delivered to the
dead-letter-queue. From the SQS console, manually view the failed message in
${aws_sqs_queue.dead_letter_queue.name}.
EOF

namespace = "AWS/SQS"
metric_name = "NumberOfMessagesReceived"
statistic = "Sum"

dimensions = {
QueueName = "${aws_sqs_queue.dead_letter_queue.name}"
}

comparison_operator = "GreaterThanThreshold"
threshold = 0
period = 60
evaluation_periods = 1
alarm_actions = ["${aws_sns_topic.metric_alarms.arn}"]
}
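
For context: messages land on the dead-letter queue only when the source queue declares a redrive policy pointing at it. That configuration is not part of this diff, so the following is a hypothetical sketch of such a queue definition, reflecting standard SQS behavior rather than this repository's exact settings.

resource "aws_sqs_queue" "downloader_queue_example" {
  // Hypothetical sketch: after maxReceiveCount failed receive attempts, SQS
  // moves the message to the dead-letter queue, which fires the alarm above.
  name = "example_downloader_queue"

  redrive_policy = <<POLICY
{
  "deadLetterTargetArn": "${aws_sqs_queue.dead_letter_queue.arn}",
  "maxReceiveCount": 3
}
POLICY
}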

// There are very few YARA rules.
resource "aws_cloudwatch_metric_alarm" "yara_rules" {
alarm_name = "${module.binaryalert_analyzer.function_name}_too_few_yara_rules"
91 changes: 43 additions & 48 deletions terraform/lambda.tf
@@ -1,33 +1,32 @@
// Create the CarbonBlack downloading Lambda function.
module "binaryalert_downloader" {
enabled = "${var.enable_carbon_black_downloader}"

// Create the analyzer Lambda function.
module "binaryalert_analyzer" {
source = "modules/lambda"
function_name = "${var.name_prefix}_binaryalert_downloader"
description = "Copies binaries from CarbonBlack into the BinaryAlert S3 bucket"
function_name = "${var.name_prefix}_binaryalert_analyzer"
description = "Analyze a binary with a set of YARA rules"
base_policy_arn = "${aws_iam_policy.base_policy.arn}"
handler = "main.download_lambda_handler"
memory_size_mb = "${var.lambda_download_memory_mb}"
timeout_sec = "${var.lambda_download_timeout_sec}"
filename = "lambda_downloader.zip"
handler = "main.analyze_lambda_handler"
memory_size_mb = "${var.lambda_analyze_memory_mb}"
timeout_sec = "${var.lambda_analyze_timeout_sec}"
filename = "lambda_analyzer.zip"

environment_variables = {
CARBON_BLACK_URL = "${var.carbon_black_url}"
ENCRYPTED_CARBON_BLACK_API_TOKEN = "${var.encrypted_carbon_black_api_token}"
TARGET_S3_BUCKET = "${aws_s3_bucket.binaryalert_binaries.id}"
YARA_MATCHES_DYNAMO_TABLE_NAME = "${aws_dynamodb_table.binaryalert_yara_matches.name}"
YARA_ALERTS_SNS_TOPIC_ARN = "${aws_sns_topic.yara_match_alerts.arn}"
}

log_retention_days = "${var.lambda_log_retention_days}"
tagged_name = "${var.tagged_name}"

// During batch operations, the analyzer will have a high error rate because of S3 latency.
alarm_errors_help = <<EOF
The downloader often times out while waiting for CarbonBlack to process the binary.
- If there are a large number of binaries being analyzed right now, this alarm should resolve
itself once the spike subsides.
- If this error persists, start troubleshooting the downloader logs.
If (a) the number of errors is not growing unbounded,
(b) the errors are correlated with a rise in S3 download latency, and
(c) the batcher is currently running (e.g. after a deploy),
then you can resolve this alert (and consider increasing the threshold for this alarm).
Otherwise, there is an unknown problem with the analyzers (which may still be related to S3).
EOF

alarm_errors_threshold = 500
alarm_errors_threshold = 50
alarm_errors_interval_secs = 300
alarm_sns_arns = ["${aws_sns_topic.metric_alarms.arn}"]
}
@@ -46,10 +45,9 @@ module "binaryalert_batcher" {
environment_variables = {
BATCH_LAMBDA_NAME = "${var.name_prefix}_binaryalert_batcher"
BATCH_LAMBDA_QUALIFIER = "Production"
OBJECT_PREFIX = ""
OBJECTS_PER_MESSAGE = "${var.lambda_batch_objects_per_message}"
S3_BUCKET_NAME = "${aws_s3_bucket.binaryalert_binaries.id}"
SQS_QUEUE_URL = "${aws_sqs_queue.s3_object_queue.id}"
SQS_QUEUE_URL = "${aws_sqs_queue.analyzer_queue.id}"
}

log_retention_days = "${var.lambda_log_retention_days}"
@@ -69,10 +67,17 @@ module "binaryalert_dispatcher" {
filename = "lambda_dispatcher.zip"

environment_variables = {
ANALYZE_LAMBDA_NAME = "${module.binaryalert_analyzer.function_name}"
ANALYZE_LAMBDA_QUALIFIER = "${module.binaryalert_analyzer.alias_name}"
MAX_DISPATCHES = "${var.lambda_dispatch_limit}"
SQS_QUEUE_URL = "${aws_sqs_queue.s3_object_queue.id}"
SQS_QUEUE_URLS = "${
var.enable_carbon_black_downloader == 1 ?
format("%s,%s", aws_sqs_queue.analyzer_queue.id,
element(concat(aws_sqs_queue.downloader_queue.*.id, list("")), 0)) :
aws_sqs_queue.analyzer_queue.id}"

LAMBDA_TARGETS = "${
var.enable_carbon_black_downloader == 1 ?
format("%s:%s,%s:%s", module.binaryalert_analyzer.function_name, module.binaryalert_analyzer.alias_name,
module.binaryalert_downloader.function_name, module.binaryalert_downloader.alias_name) :
format("%s:%s", module.binaryalert_analyzer.function_name, module.binaryalert_analyzer.alias_name)}"
}

log_retention_days = "${var.lambda_log_retention_days}"
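
For illustration only (hypothetical values, assuming name_prefix = "prefix" and a "Production" alias), the two variables above resolve to flat comma-separated strings, which the dispatcher presumably splits at runtime:

locals {
  // With the CarbonBlack downloader enabled, each value carries two entries;
  // with it disabled, only the analyzer entry remains.
  example_sqs_queue_urls = "https://sqs.us-east-1.amazonaws.com/123456789012/prefix_binaryalert_analyzer_queue,https://sqs.us-east-1.amazonaws.com/123456789012/prefix_binaryalert_downloader_queue"
  example_lambda_targets = "prefix_binaryalert_analyzer:Production,prefix_binaryalert_downloader:Production"
}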
@@ -90,36 +95,26 @@ resource "aws_lambda_permission" "allow_cloudwatch_to_invoke_dispatch" {
qualifier = "${module.binaryalert_dispatcher.alias_name}"
}

// Create the analyzer Lambda function.
module "binaryalert_analyzer" {
// Create the CarbonBlack downloading Lambda function.
module "binaryalert_downloader" {
enabled = "${var.enable_carbon_black_downloader}"

source = "modules/lambda"
function_name = "${var.name_prefix}_binaryalert_analyzer"
description = "Analyze a binary with a set of YARA rules"
function_name = "${var.name_prefix}_binaryalert_downloader"
description = "Copies binaries from CarbonBlack into the BinaryAlert S3 bucket"
base_policy_arn = "${aws_iam_policy.base_policy.arn}"
handler = "main.analyze_lambda_handler"
memory_size_mb = "${var.lambda_analyze_memory_mb}"
timeout_sec = "${var.lambda_analyze_timeout_sec}"
filename = "lambda_analyzer.zip"
handler = "main.download_lambda_handler"
memory_size_mb = "${var.lambda_download_memory_mb}"
timeout_sec = "${var.lambda_download_timeout_sec}"
filename = "lambda_downloader.zip"

environment_variables = {
SQS_QUEUE_URL = "${aws_sqs_queue.s3_object_queue.id}"
YARA_MATCHES_DYNAMO_TABLE_NAME = "${aws_dynamodb_table.binaryalert_yara_matches.name}"
YARA_ALERTS_SNS_TOPIC_ARN = "${aws_sns_topic.yara_match_alerts.arn}"
CARBON_BLACK_URL = "${var.carbon_black_url}"
ENCRYPTED_CARBON_BLACK_API_TOKEN = "${var.encrypted_carbon_black_api_token}"
TARGET_S3_BUCKET = "${aws_s3_bucket.binaryalert_binaries.id}"
}

log_retention_days = "${var.lambda_log_retention_days}"
tagged_name = "${var.tagged_name}"

// During batch operations, the analyzer will have a high error rate because of S3 latency.
alarm_errors_help = <<EOF
If (a) the number of errors is not growing unbounded,
(b) the errors are correlated with a rise in S3 download latency, and
(c) the batcher is currently running (e.g. after a deploy),
then you can resolve this alert (and consider increasing the threshold for this alarm).
Otherwise, there is an unknown problem with the analyzers (which may still be related to S3).
EOF

alarm_errors_threshold = 50
alarm_errors_interval_secs = 300
alarm_sns_arns = ["${aws_sns_topic.metric_alarms.arn}"]
alarm_sns_arns = ["${aws_sns_topic.metric_alarms.arn}"]
}
