From 1714b1edf2e4f0e33427cb96ead66ae922c12e57 Mon Sep 17 00:00:00 2001
From: ryandeivert
Date: Tue, 31 Mar 2020 13:03:16 -0700
Subject: [PATCH] official release of v3.1.1 (#1215)

* bumping version to 3.1.1

* fixing bad links in docs, other touch ups (#1214)

* DRY out some code

* fix

* updating role paths used by scheduled queries (#1216)

Co-authored-by: Derek Wang
---
 docs/source/apps.rst                          |  2 +-
 docs/source/config-clusters.rst               |  6 +--
 docs/source/datasources.rst                   |  4 +-
 docs/source/getting-started.rst               |  6 +--
 docs/source/historical-search.rst             | 54 +++++++++++++------
 docs/source/rules.rst                         | 12 +++--
 streamalert/__init__.py                       |  2 +-
 streamalert/athena_partition_refresh/main.py  | 23 +-------
 streamalert/shared/config.py                  | 34 ++++++++++++
 .../cloudwatch_schedule.tf                    |  4 ++
 .../modules/tf_scheduled_queries/iam_roles.tf | 16 ++++--
 .../tf_scheduled_queries/step_function.tf     |  4 ++
 .../modules/tf_scheduled_queries/versions.tf  |  4 --
 streamalert_cli/terraform/athena.py           |  4 +-
 .../terraform/scheduled_queries.py            | 13 ++---
 15 files changed, 118 insertions(+), 70 deletions(-)
 delete mode 100644 streamalert_cli/_infrastructure/modules/tf_scheduled_queries/versions.tf

diff --git a/docs/source/apps.rst b/docs/source/apps.rst
index 60f7c8f6c..d102af7bd 100644
--- a/docs/source/apps.rst
+++ b/docs/source/apps.rst
@@ -224,7 +224,7 @@ To update an App's credentials, run the following command:
 
   python manage.py app update-auth --cluster --name 
 
-This will have you follow a process similar to `configuring a new App `_.
+This will have you follow a process similar to `configuring a new App <#configuring-an-app>`_.
 
 
 ********************
diff --git a/docs/source/config-clusters.rst b/docs/source/config-clusters.rst
index cbbb54697..bab8b4a4d 100644
--- a/docs/source/config-clusters.rst
+++ b/docs/source/config-clusters.rst
@@ -6,7 +6,7 @@ Inbound data is directed to one of StreamAlert's *clusters*, each with its own d
 and classifier function. For many applications, one cluster may be enough. However, adding
 additional clusters can potentially improve performance. For example, you could have:
 
-  * A cluster dedicated to `StreamAlert apps `_
+  * A cluster dedicated to `StreamAlert apps `_
   * A separate cluster for each of your inbound `Kinesis Data Streams `_
   * A separate cluster for data from each environment (prod, staging, corp, etc)
 
@@ -53,7 +53,7 @@ from that source.
 
 .. note::
 
   Log schemas are defined in one or more files in the ``conf/schemas`` directory. See
-  the `Schemas `_ page for more information, or the
+  the `Schemas `_ page for more information, or the
   `Example Schemas `_ page for some sample log definitions.
 
 Each log in the list of logs instructs StreamAlert's classifier function to attempt
@@ -97,7 +97,7 @@ Example
 
 .. important::
 
   Any data source log type that is listed must have an associated log definition
-  within your `schemas `_ definitions.
+  within your `schemas `_ definitions.
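+
+  As an illustrative sketch (the SNS topic name here is hypothetical), a cluster receiving
+  CloudWatch events over an SNS topic would declare the log type for that source like so:
+
+  .. code-block:: json
+
+    {
+      "data_sources": {
+        "sns": {
+          "example-topic-name": ["cloudwatch"]
+        }
+      }
+    }
+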
 Classifier Configuration
diff --git a/docs/source/datasources.rst b/docs/source/datasources.rst
index 70b2643ba..e9d95497a 100644
--- a/docs/source/datasources.rst
+++ b/docs/source/datasources.rst
@@ -15,7 +15,7 @@ These services above can accept data from:
 * Amazon CloudWatch Events
 * And more
 
-To configure datasources, read `datasource configuration `_
+To configure datasources for a cluster, read `datasource configuration `_
 
 
 *********
@@ -41,7 +41,7 @@ Example non-AWS use-cases:
 Amazon Kinesis Data Streams
 ***************************
 StreamAlert also utilizes Amazon Kinesis Data Streams for real-time data ingestion and analysis.
-By default, StreamAlert creates an Amazon Kinesis Data Stream per `cluster `_.
+By default, StreamAlert creates an Amazon Kinesis Data Stream per `cluster `_.
 
 
 Sending to Amazon Kinesis Data Streams
diff --git a/docs/source/getting-started.rst b/docs/source/getting-started.rst
index 1eb425bc1..ee4781006 100644
--- a/docs/source/getting-started.rst
+++ b/docs/source/getting-started.rst
@@ -169,7 +169,7 @@ SNS for both sending the log data and receiving the alert, but StreamAlert also
 
 .. note:: You will need to click the verification link in your email to activate the subscription.
 
-4. Add the ``streamalert-test-data`` SNS topic as an input to the (default) ``prod`` `cluster `_.
+4. Add the ``streamalert-test-data`` SNS topic as an input to the (default) ``prod`` `cluster `_.
 Open ``conf/clusters/prod.json`` and change the ``streamalert`` module to look like this:
 
 .. code-block:: json
@@ -189,7 +189,7 @@ Open ``conf/clusters/prod.json`` and change the ``streamalert`` module to look l
 }
 }
 
-5. Tell StreamAlert which `log schemas `_ will be sent to this input.
+5. Tell StreamAlert which `log schemas `_ will be sent to this input.
 Open ``conf/clusters/prod.json`` and change the ``data_sources`` section to look like this:
 
 .. code-block:: json
@@ -284,7 +284,7 @@ dropdown on the left and preview the ``alerts`` table:
 :target: _images/athena-alerts-search.png
 
 (Here, my name prefix is ``testv2``.) If no records are returned, look for errors
-in the ``athena_partition_refresh`` function or try invoking it directly.
+in the Athena Partition Refresh function or try invoking it directly.
 
 And there you have it! Ingested log data is parsed, classified, and scanned by the rules
 engine. Any resulting alerts are delivered to your configured output(s) within a matter of minutes.
diff --git a/docs/source/historical-search.rst b/docs/source/historical-search.rst
index cbafac739..c8f45c444 100644
--- a/docs/source/historical-search.rst
+++ b/docs/source/historical-search.rst
@@ -1,16 +1,32 @@
+#################
 Historical Search
 #################
 
-StreamAlert historical search feature is backed by Amazon S3 and `Athena `_ services. By default, StreamAlert will send all alerts to S3 and those alerts will be searchable in Athena table. StreamAlert users have option to enable historical search feature for data as well.
+The StreamAlert historical search feature is backed by Amazon S3 and `Athena `_.
+By default, StreamAlert sends all alerts to S3, where they are searchable in an Athena table. StreamAlert
+users also have the option to enable historical search for log data.
+
+As of StreamAlert v3.1.0, a new field, ``file_format``, has been added to ``athena_partition_refresh_config``
+in ``conf/lambda.json``, defaulting to ``null``. This field allows users to configure how the data processed
+by the Classifier is stored in the S3 bucket, either in ``parquet`` or ``json``.
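+
+A minimal sketch of this setting in ``conf/lambda.json`` (all other fields omitted for brevity):
+
+.. code-block:: json
+
+  {
+    "athena_partition_refresh_config": {
+      "file_format": "parquet"
+    }
+  }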
-As of StreamAlert v3.1.0, a new field, ``file_format``, has been added to ``athena_partition_refresh_config`` in ``conf/lamba.json``, defaulting to ``null``. This field allows users to configure how the data processed by the Classifier is stored in S3 bucket—either in ``parquet`` or ``json``. Prior to v3.1.0, all data was stored in ``json``. When using this format, Athena's search performance degrades greatly when partition sizes grow. To address this, we've introduce support for ``parquet`` to provide better Athena search performance and cost saving.
+Prior to v3.1.0, all data was stored in ``json``. When using this format, Athena's search performance
+degrades greatly as partition sizes grow. To address this, we've introduced support for ``parquet``
+to provide better Athena search performance and cost savings.
 
 .. note::
 
-  * When upgrading StreamAlert to v3.1.0, it is required to change the default ``file_format`` value to either ``parquet`` or ``json``, otherwise StreamAlert will raise ``MisconfigurationError`` exception when run ``python manage.py build``.
-  * For existing deployments, ``file_format`` can be set to ``json`` and there will have no change occurred. However, if the ``file_format`` is changed to ``parquet``, all Athena tables need to be created to load ``parquet`` format. The existing JSON data won't be searchable anymore unless we build a separated tables to process data in JSON format. (All data stay in S3 bucket, there is no data loss.).
-  * For new StreamAlert deployments, it is recommended to set ``file_format`` to ``parquet`` to take the advantage of better Athena search performance and save the cost when scanning data.
-  * In the future release, the default value of ``file_format`` will change to ``parquet``. So let's change now!
+  * When upgrading to StreamAlert v3.1.0, you must set the ``file_format`` value to either ``parquet``
+    or ``json``, otherwise StreamAlert will raise a ``MisconfigurationError`` exception when running
+    ``python manage.py build``.
+  * For existing deployments, the ``file_format`` value can be set to ``json`` to retain current
+    functionality. However, if the ``file_format`` is changed to ``parquet``, the Athena tables will
+    need to be recreated to load the ``parquet`` format. The existing JSON data won't be searchable
+    anymore unless a separate table is built to process the JSON-formatted data. All of the underlying
+    data remains stored in the S3 bucket, so there is no data loss.
+  * For new StreamAlert deployments, it is recommended to set ``file_format`` to ``parquet`` to
+    take advantage of better Athena search performance and cost savings when scanning data.
+  * In an upcoming release, the value for ``file_format`` will be set to ``parquet`` by default, so it's best to switch now!
 
 ************
 Architecture
@@ -19,11 +35,13 @@ Architecture
 
 .. image:: ../images/historical-search.png
   :align: left
 
-The pipeline is
-* StreamAlert creates an Athena Database, alerts kinesis Firehose and ``alerts`` table during initial deployment
-* Optional to create Firehose and Athena tables for data
-* S3 events will be sent to SQS to invoke ``athena_partition_refresh`` lambda function to add new partitions when there are new alerts or data saved in S3 bucket via Firehose
-* New alerts and data are available for searching via Athena console or SDK
+The pipeline is:
+
+  #. StreamAlert creates an Athena database, an alerts Kinesis Data Firehose, and the ``alerts`` table during the initial deployment
+  #. Optionally create Firehose resources and Athena tables for historical data retention
+  #. S3 events will be sent to an SQS queue that is mapped to the Athena Partition Refresh Lambda function
+  #. The Lambda function adds new partitions when there are new alerts or data saved to the S3 bucket via Firehose
+  #. Alerts, and optionally data, are available for searching via the Athena console or the Athena API (see the example query below)
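+
+For example, once partitions have been loaded, recent alerts can be pulled up from the Athena
+console with a query like the one below (the ``dt`` partition value is illustrative, and the
+table name assumes a default deployment):
+
+.. code-block:: sql
+
+  SELECT * FROM alerts WHERE dt = '2020-03-31-13' LIMIT 10;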
 
 .. _alerts_search:
 
@@ -31,9 +49,15 @@ The pipeline is
 Alerts Search
 *************
 
-* Review alert Firehose configuration, see :ref:`alerts_firehose_configuration` in ``CONFIGURATION`` session. Athena database and Athena alerts table are created automatically when you first deploy StreamAlert.
-* If the ``file_format`` is set to ``parquet``, you can run ``MSCK REPAIR TABLE alerts`` command in the Athena to load all available partitions and then alerts can be searchable. However, using ``MSCK REPAIR`` command can not load new partitions automatically.
-* StreamAlert provides a lambda function ``athena_partition_refresh`` to load new partitions to Athena tables once the data arrives in the S3 buckets automatically. Update ``athena_partition_refresh_config`` if necessary. Open ``conf/lambda.json``. See more settings :ref:`configure_athena_partition_refresh_lambda`
+* Review the settings for the :ref:`Alerts Firehose Configuration ` and
+  the :ref:`Athena Partition Refresh` function. Note that
+  the Athena database and alerts table are created automatically when you first deploy StreamAlert.
+* If the ``file_format`` value within the :ref:`Athena Partition Refresh`
+  function config is set to ``parquet``, you can run the ``MSCK REPAIR TABLE alerts`` command in
+  Athena to load all available partitions, after which alerts will be searchable. Note, however, that
+  the ``MSCK REPAIR`` command cannot load new partitions automatically.
+* StreamAlert includes a Lambda function to automatically add new partitions for Athena tables when
+  the data arrives in S3. See :ref:`configure_athena_partition_refresh_lambda`
 
 .. code-block:: bash
 
@@ -45,7 +69,7 @@ Alerts Search
 }
 }
 
-* Deploy athena_partition_refresh lambda function
+* Deploy the Athena Partition Refresh Lambda function
 
 .. code-block:: bash
 
diff --git a/docs/source/rules.rst b/docs/source/rules.rst
index 38e7801b0..69c61f1ad 100644
--- a/docs/source/rules.rst
+++ b/docs/source/rules.rst
@@ -51,7 +51,7 @@ The simplest possible rule looks like this:
 
     return True
 
 This rule will be evaluated against all inbound logs that match the ``cloudwatch:events`` schema defined in a schema file in the ``conf/schemas`` directory, i.e ``conf/schemas/cloudwatch.json``.
-In this case, *all* CloudWatch events will generate an alert, which will be sent to the `alerts Athena table `_.
+In this case, *all* CloudWatch events will generate an alert, which will be sent to the `alerts Athena table `_.
 
 
 Example: Logic & Outputs
@@ -70,7 +70,8 @@ Let's modify the rule to page the security team if anyone ever uses AWS root cre
 
             and record['detail']['eventType'] != 'AwsServiceEvent')
 
 Now, any AWS root account usage is reported to PagerDuty, Slack, and the aforementioned Athena table.
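+
+As a rough sketch, the outputs referenced above live in ``conf/outputs.json``, which for this
+rule might contain something like the following (descriptor names are illustrative):
+
+.. code-block:: json
+
+  {
+    "pagerduty": ["csirt"],
+    "slack": ["security"]
+  }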
-In order for this to work, your `datasources `_ and `outputs `_ must be configured so that:
+In order for this to work, your `datasources `_ and
+`outputs `_ must be configured so that:
 
 * CloudTrail logs are being sent to StreamAlert via CloudWatch events
 * The ``pagerduty:csirt`` and ``slack:security`` outputs have the proper credentials
@@ -187,8 +188,9 @@ The following table provides an overview of each rule option, with more details
 
   ``logs`` define the log schema(s) supported by the rule.
 
-  Log `sources `_ are defined under the ``data_sources`` field for a cluster defined in ``conf/clusters/.json``
-  and their `schemas `_ are defined in one or more files in the ``conf/schemas`` directory.
+  Log `datasources `_ are defined within the
+  ``data_sources`` field of a cluster's configuration, such as ``conf/clusters/.json``, and their
+  `schemas `_ are defined in one or more files in the ``conf/schemas`` directory.
 
 .. note::
 
@@ -254,7 +256,7 @@ The following table provides an overview of each rule option, with more details
 
   .. note::
 
-    The original (unmerged) alert will always be sent to `Athena `_.
+    The original (unmerged) alert will always be sent to `Athena `_.
 
 :dynamic_outputs:
 
diff --git a/streamalert/__init__.py b/streamalert/__init__.py
index 3d25a00d1..49f37a67d 100644
--- a/streamalert/__init__.py
+++ b/streamalert/__init__.py
@@ -1,2 +1,2 @@
 """StreamAlert version."""
-__version__ = '3.1.0'
+__version__ = '3.1.1'
diff --git a/streamalert/athena_partition_refresh/main.py b/streamalert/athena_partition_refresh/main.py
index 91acffb28..fb7667bb7 100644
--- a/streamalert/athena_partition_refresh/main.py
+++ b/streamalert/athena_partition_refresh/main.py
@@ -23,7 +23,7 @@
 from streamalert.shared.utils import get_database_name, get_data_file_format
 from streamalert.shared.athena import AthenaClient
-from streamalert.shared.config import firehose_alerts_bucket, firehose_data_bucket, load_config
+from streamalert.shared.config import athena_partition_buckets, load_config
 from streamalert.shared.exceptions import ConfigError
 from streamalert.shared.logger import get_logger
 
@@ -83,7 +83,7 @@ def __init__(self):
             )
             raise ConfigError(message)
 
-        self._athena_buckets = self.buckets_from_config(config)
+        self._athena_buckets = athena_partition_buckets(config)
 
         db_name = get_database_name(config)
 
@@ -97,25 +97,6 @@ def __init__(self):
 
         self._create_client(db_name, results_bucket)
 
-    @classmethod
-    def buckets_from_config(cls, config):
-        """Get the buckets from default buckets and additionally configured ones
-
-        Args:
-            config (dict): The loaded config from the 'conf/' directory
-
-        Returns:
-            list: Bucket names for which Athena is enabled
-        """
-        athena_config = config['lambda']['athena_partition_refresh_config']
-        data_buckets = athena_config.get('buckets', {})
-        data_buckets[firehose_alerts_bucket(config)] = 'alerts'
-        data_bucket = firehose_data_bucket(config)  # Data retention is optional, so check for this
-        if data_bucket:
-            data_buckets[data_bucket] = 'data'
-
-        return data_buckets
-
     @classmethod
     def _create_client(cls, db_name, results_bucket):
         if cls._ATHENA_CLIENT:
diff --git a/streamalert/shared/config.py b/streamalert/shared/config.py
index 5ea70c995..150cc5285 100644
--- a/streamalert/shared/config.py
+++ b/streamalert/shared/config.py
@@ -115,6 +115,40 @@ def firehose_alerts_bucket(config):
     )
 
 
+def athena_partition_buckets(config):
+    """Get the default buckets, plus any additionally configured ones
+
+    Args:
+        config (dict): The loaded config from the 'conf/' directory
+
+    Returns:
+        dict: Bucket names for which Athena is enabled
+    """
+    athena_config = config['lambda']['athena_partition_refresh_config']
+    data_buckets = athena_config.get('buckets', {})
+    data_buckets[firehose_alerts_bucket(config)] = 'alerts'
+    data_bucket = firehose_data_bucket(config)  # Data retention is optional, so check for this
+    if data_bucket:
+        data_buckets[data_bucket] = 'data'
+
+    return data_buckets
+
+
+def athena_query_results_bucket(config):
+    """Get the S3 bucket to which Athena queries write their results.
+
+    Args:
+        config (dict): The loaded config
+
+    Returns:
+        str: The name of the S3 bucket.
+    """
+    athena_config = config['lambda']['athena_partition_refresh_config']
+    prefix = config['global']['account']['prefix']
+
+    return athena_config.get(
+        'results_bucket',
+        '{}.streamalert.athena-results'.format(prefix)
+    ).strip()
+
+
 def parse_lambda_arn(function_arn):
     """Extract info on the current environment from the lambda function ARN
 
diff --git a/streamalert_cli/_infrastructure/modules/tf_scheduled_queries/cloudwatch_schedule.tf b/streamalert_cli/_infrastructure/modules/tf_scheduled_queries/cloudwatch_schedule.tf
index 8bfc7fee1..87fce10f8 100644
--- a/streamalert_cli/_infrastructure/modules/tf_scheduled_queries/cloudwatch_schedule.tf
+++ b/streamalert_cli/_infrastructure/modules/tf_scheduled_queries/cloudwatch_schedule.tf
@@ -7,6 +7,10 @@ resource "aws_cloudwatch_event_rule" "event" {
   name                = "${var.prefix}_streamalert_scheduled_queries_event_${count.index}"
   description         = var.query_packs[count.index].description
   schedule_expression = var.query_packs[count.index].schedule_expression
+
+  tags = {
+    Name = "StreamAlert"
+  }
 }
 
 resource "aws_cloudwatch_event_target" "run_step_function" {
diff --git a/streamalert_cli/_infrastructure/modules/tf_scheduled_queries/iam_roles.tf b/streamalert_cli/_infrastructure/modules/tf_scheduled_queries/iam_roles.tf
index b308409cc..bce2fab41 100644
--- a/streamalert_cli/_infrastructure/modules/tf_scheduled_queries/iam_roles.tf
+++ b/streamalert_cli/_infrastructure/modules/tf_scheduled_queries/iam_roles.tf
@@ -15,7 +15,7 @@
 # Attach additional permissions to the auto-generated Lambda IAM Role
 resource "aws_iam_role_policy" "lambda_permissions" {
-  name   = "${var.prefix}_streamalert_scheduled_queries_lambda_permissions"
+  name   = "LambdaRequiredPermissions"
   role   = module.scheduled_queries_lambda.role_id
   policy = data.aws_iam_policy_document.lambda_permissions.json
 }
@@ -97,7 +97,12 @@ data "aws_iam_policy_document" "lambda_permissions" {
 # Setup the IAM Role for the Step Functions
 resource "aws_iam_role" "iam_for_step_functions" {
   name               = "${var.prefix}_streamalert_scheduled_queries_state_machines"
+  path               = "/streamalert/"
   assume_role_policy = data.aws_iam_policy_document.iam_step_function_assume_role.json
+
+  tags = {
+    Name = "StreamAlert"
+  }
 }
 
 # Only allow Step Functions to assume this role
@@ -116,7 +121,7 @@ data "aws_iam_policy_document" "iam_step_function_assume_role" {
 # Attach an additional policy to the IAM Role
 resource "aws_iam_role_policy" "stepfunction_permissions" {
-  name   = "${var.prefix}_streamalert_scheduled_queries_state_machine_permissions"
+  name   = "StepFunctionsInvokeLambda"
   role   = aws_iam_role.iam_for_step_functions.id
   policy = data.aws_iam_policy_document.stepfunction_permissions.json
 }
@@ -143,7 +148,12 @@ data "aws_iam_policy_document" "stepfunction_permissions" {
 # Setup the IAM Role
 resource "aws_iam_role" "iam_for_cloudwatch_schedule" {
   name               = "${var.prefix}_streamalert_scheduled_queries_cloudwatch_schedule"
+  path               = "/streamalert/"
   assume_role_policy = 
data.aws_iam_policy_document.iam_cloudwatch_assume_role.json + + tags = { + Name = "StreamAlert" + } } # Only allow cloudwatch to assume this role @@ -162,7 +172,7 @@ data "aws_iam_policy_document" "iam_cloudwatch_assume_role" { # Attach additional permissions to the IAM Role resource "aws_iam_role_policy" "cloudwatch_schedule_permissions" { - name = "${var.prefix}_streamalert_scheduled_queries_cloudwatch_schedule_permissions" + name = "StepFunctionsStartViaCWE" role = aws_iam_role.iam_for_cloudwatch_schedule.id policy = data.aws_iam_policy_document.cloudwatch_schedule_permission.json } diff --git a/streamalert_cli/_infrastructure/modules/tf_scheduled_queries/step_function.tf b/streamalert_cli/_infrastructure/modules/tf_scheduled_queries/step_function.tf index 0cfd52b81..3c03917a2 100644 --- a/streamalert_cli/_infrastructure/modules/tf_scheduled_queries/step_function.tf +++ b/streamalert_cli/_infrastructure/modules/tf_scheduled_queries/step_function.tf @@ -64,4 +64,8 @@ resource "aws_sfn_state_machine" "state_machine" { } EOF + tags = { + Name = "StreamAlert" + } + } diff --git a/streamalert_cli/_infrastructure/modules/tf_scheduled_queries/versions.tf b/streamalert_cli/_infrastructure/modules/tf_scheduled_queries/versions.tf deleted file mode 100644 index ac97c6ac8..000000000 --- a/streamalert_cli/_infrastructure/modules/tf_scheduled_queries/versions.tf +++ /dev/null @@ -1,4 +0,0 @@ - -terraform { - required_version = ">= 0.12" -} diff --git a/streamalert_cli/terraform/athena.py b/streamalert_cli/terraform/athena.py index 23e19941a..73f994e10 100644 --- a/streamalert_cli/terraform/athena.py +++ b/streamalert_cli/terraform/athena.py @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. """ -from streamalert.athena_partition_refresh.main import AthenaRefresher from streamalert.shared import metrics +from streamalert.shared.config import athena_partition_buckets from streamalert_cli.manage_lambda.package import AthenaPackage from streamalert_cli.terraform.common import ( infinitedict, @@ -35,7 +35,7 @@ def generate_athena(config): athena_dict = infinitedict() athena_config = config['lambda']['athena_partition_refresh_config'] - data_buckets = sorted(AthenaRefresher.buckets_from_config(config)) + data_buckets = sorted(athena_partition_buckets(config)) prefix = config['global']['account']['prefix'] database = athena_config.get('database_name', '{}_streamalert'.format(prefix)) diff --git a/streamalert_cli/terraform/scheduled_queries.py b/streamalert_cli/terraform/scheduled_queries.py index 58dc00671..6041e80ca 100644 --- a/streamalert_cli/terraform/scheduled_queries.py +++ b/streamalert_cli/terraform/scheduled_queries.py @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
""" -from streamalert.shared.config import firehose_alerts_bucket, firehose_data_bucket +from streamalert.shared.config import athena_partition_buckets, athena_query_results_bucket from streamalert_cli.manage_lambda.package import ScheduledQueriesPackage from streamalert_cli.terraform.common import monitoring_topic_arn @@ -35,15 +35,9 @@ def generate_scheduled_queries_module_configuration(config): # '${module.streamalert_athena.results_bucket_arn}' # Because it takes a bucket name, not an ARN # FIXME (derek.wang) DRY out this code - results_bucket = athena_config.get( - 'results_bucket', - '{}.streamalert.athena-results'.format(prefix) - ).strip() + results_bucket = athena_query_results_bucket(config) - athena_s3_buckets = [ - firehose_alerts_bucket(config), - firehose_data_bucket(config), - ] + athena_s3_buckets = athena_partition_buckets(config) # Copy the config over directly scheduled_queries_module = streamquery_config.get('config', {}) @@ -51,7 +45,6 @@ def generate_scheduled_queries_module_configuration(config): # Derive a bunch of required fields from other scheduled_queries_module.update({ 'source': './modules/tf_scheduled_queries', - 'prefix': prefix, 'account_id': config['global']['account']['aws_account_id'], 'region': config['global']['account']['region'],