diff --git a/lambda_functions/analyzer/analyzer_aws_lib.py b/lambda_functions/analyzer/analyzer_aws_lib.py
index 20ca752..5618ccc 100644
--- a/lambda_functions/analyzer/analyzer_aws_lib.py
+++ b/lambda_functions/analyzer/analyzer_aws_lib.py
@@ -1,40 +1,54 @@
 """Collection of boto3 calls to AWS resources for the analyzer function."""
 import json
-import logging
+from typing import Dict, List, Set, Tuple, Union
 
 import boto3
+from boto3.dynamodb.conditions import Key
+
+if __package__:
+    from lambda_functions.analyzer.binary_info import BinaryInfo
+    from lambda_functions.analyzer.common import LOGGER
+else:
+    from binary_info import BinaryInfo
+    from common import LOGGER
 
-LOGGER = logging.getLogger()
 SNS_PUBLISH_SUBJECT_MAX_SIZE = 99
 
+# Build boto3 resources at import time so they can be cached between invocations.
+CLOUDWATCH = boto3.client('cloudwatch')
+DYNAMODB = boto3.resource('dynamodb')
+S3 = boto3.resource('s3')
+SNS = boto3.resource('sns')
+SQS = boto3.resource('sqs')
+
 
-def download_from_s3(bucket_name, object_key, download_path):
-    """Download an object from S3 into local /tmp storage.
+def download_from_s3(
+        bucket_name: str, object_key: str, download_path: str) -> Tuple[str, Dict[str, str]]:
+    """Download an object from S3 to the given download path.
 
     Args:
-        bucket_name: [string] S3 bucket name.
-        object_key: [string] S3 object key.
-        download_path: [string] Where to download the file locally.
+        bucket_name: S3 bucket name.
+        object_key: S3 object key.
+        download_path: Where to download the file locally.
 
     Returns:
-        [dict] S3 metadata.
+        Last modified timestamp (i.e. object upload timestamp), object metadata.
     """
-    response = boto3.client('s3').get_object(Bucket=bucket_name, Key=object_key)
-    with open(download_path, 'wb') as file:
-        file.write(response['Body'].read())
+    s3_object = S3.Object(bucket_name, object_key)
+    s3_object.download_file(download_path)
+    last_modified = str(s3_object.last_modified)  # UTC timestamp, e.g. '2017-09-04 04:49:06+00:00'
+    return last_modified, s3_object.metadata
 
-    return response['Metadata']
-
-def _elide_string_middle(text, max_length):
+def _elide_string_middle(text: str, max_length: int) -> str:
     """Replace the middle of the text with ellipses to shorten text to the desired length.
 
     Args:
-        text: [string] Text to shorten.
-        max_length: [int] Maximum allowable length of the string.
+        text: Text to shorten.
+        max_length: Maximum allowable length of the string.
 
     Returns:
-        [string] The elided text, e.g. "Some really long tex ... the end."
+        The elided text, e.g. "Some really long tex ... the end."
     """
     if len(text) <= max_length:
         return text
@@ -43,60 +57,58 @@ def _elide_string_middle(text, max_length):
     return '{} ... {}'.format(text[:half_len], text[-half_len:])
 
 
-def publish_alert_to_sns(binary, topic_arn):
+def publish_alert_to_sns(binary: BinaryInfo, topic_arn: str) -> None:
     """Publish a JSON SNS alert: a binary has matched one or more YARA rules.
 
     Args:
-        binary: [BinaryInfo] Instance containing information about the binary.
-        topic_arn: [string] Publish to this SNS topic ARN.
+        binary: Instance containing information about the binary.
+        topic_arn: Publish to this SNS topic ARN.
""" - subject = 'BinaryAlert: {} matches a YARA rule'.format( - binary.observed_path or binary.reported_md5 or binary.computed_md5) - boto3.client('sns').publish( - TopicArn=topic_arn, + subject = '[BinaryAlert] {} matches a YARA rule'.format( + binary.filepath or binary.computed_sha) + SNS.Topic(topic_arn).publish( Subject=_elide_string_middle(subject, SNS_PUBLISH_SUBJECT_MAX_SIZE), Message=(json.dumps(binary.summary(), indent=4, sort_keys=True)) ) -def delete_sqs_messages(queue_url, receipts): +def delete_sqs_messages(queue_url: str, receipts: List[str]) -> None: """Mark a batch of SQS receipts as completed (removing them from the queue). Args: - queue_url: [string] The URL of the SQS queue containing the messages. - receipts: [list] List of SQS receipt handles. + queue_url: The URL of the SQS queue containing the messages. + receipts: List of SQS receipt handles. """ LOGGER.info('Deleting %d SQS receipt(s) from %s', len(receipts), queue_url) - boto3.client('sqs').delete_message_batch( - QueueUrl=queue_url, + SQS.Queue(queue_url).delete_messages( Entries=[ {'Id': str(index), 'ReceiptHandle': receipt} for index, receipt in enumerate(receipts)] ) -def _compute_statistics(values): +def _compute_statistics(values: List[Union[int, float]]) -> Dict[str, Union[int, float]]: """Compute summary statistics for a collection of values. Args: - values: [list] of numeric values in the sample set. + values: numeric values in the sample set. Returns: - [dict] designed to be published as CloudWatch metric statistics. + CloudWatch metric statistics dictionary. """ return { - 'SampleCount': len(values), - 'Sum': sum(values), 'Minimum': min(values), - 'Maximum': max(values) + 'Maximum': max(values), + 'SampleCount': len(values), + 'Sum': sum(values) } -def put_metric_data(num_yara_rules, binaries): +def put_metric_data(num_yara_rules: int, binaries: List[BinaryInfo]) -> None: """Publish custom metric data to CloudWatch. Args: - num_yara_rules: [string] Number of YARA rules in the analyzer. - binaries: [list of BinaryInfo()] List of analyzed BinaryInfo()s. + num_yara_rules: Number of YARA rules in the analyzer. + binaries: List of analyzed BinaryInfo()s. """ LOGGER.debug('Sending metric data') metric_data = [ @@ -121,107 +133,102 @@ def put_metric_data(num_yara_rules, binaries): 'Unit': 'Milliseconds' } ] - boto3.client('cloudwatch').put_metric_data(Namespace='BinaryAlert', MetricData=metric_data) + CLOUDWATCH.put_metric_data(Namespace='BinaryAlert', MetricData=metric_data) class DynamoMatchTable(object): """Saves YARA match information into a Dynamo table. The table uses a composite key: - SHA256: [string] [hash key] SHA256 digest computed over the binary blob. - LambdaVersion: [int] [range key] Lambda version that found the match. + SHA256 (str): [hash key] SHA256 digest computed over the binary blob. + AnalyzerVersion (int): [range key] Analyzer Lambda version that found the match. Additionally, items have the following attributes: - MD5_Computed: [string] MD5 digest computed over the binary blob. - MD5_Reported: [string] (optional) User-specified MD5 in the S3 object metadata. - Allows the user to upload a partial binary and still retain the original MD5, e.g. for - lookup in other incident response tools. - MatchedRules: [string set] A nonempty set of matched YARA rule names. - SamplePath: [string] (optional) User-specified observed filepath in the S3 object metadata. - S3Objects: [string set] A set of S3 keys containing the corresponding binary. 
+ MatchedRules (Set[str]): A nonempty set of matched YARA rule names. + MD5 (str): MD5 digest computed over the binary blob. + S3LastModified (str): When the S3 object was last modified. + This shows when the file was uploaded (assuming no other modifications). + S3Metadata (Dict[str, str]): S3 object metadata for the first matching S3 object. + If the downloader is enabled, this will include CarbonBlack metadata (e.g. filename). + S3Objects (Set[str]): A set of S3 keys containing the corresponding binary. Duplicate uploads (multiple binaries with the same SHA) are allowed. """ - def __init__(self, table_name): + def __init__(self, table_name: str): """Establish connection to Dynamo. Args: - table_name: [string] The name of the Dynamo table containing match information. + table_name: The name of the Dynamo table containing match information. """ - self._table_name = table_name - self._client = boto3.client('dynamodb') + self._table = DYNAMODB.Table(table_name) - def _most_recent_item(self, sha): + def _most_recent_item(self, sha: str) -> Union[ + Tuple[int, Set[str], Set[str], Set[str]], None]: """Query the table for the most recent entry with the given SHA. Args: - sha: [string] SHA256 to query. + sha: SHA256 to query. Returns: - 3-tuple: ([int] LambdaVersion, [set] MatchedRules, [set] S3Objects) + 4-tuple: (AnalyzerVersion, MatchedRules, S3Objects, PreviousS3Objects) Returns None if there is no matching item. """ - most_recent_item = self._client.query( - TableName=self._table_name, + most_recent_items = self._table.query( Select='SPECIFIC_ATTRIBUTES', Limit=2, # We only need the most recent analyses. ConsistentRead=True, - ScanIndexForward=False, # Sort by LambdaVersion descending (e.g. newest first). - ProjectionExpression='LambdaVersion,MatchedRules,S3Objects', - KeyConditionExpression='SHA256 = :sha', - ExpressionAttributeValues={':sha': {'S': sha}} + ScanIndexForward=False, # Sort by AnalyzerVersion descending (e.g. newest first). + ProjectionExpression='AnalyzerVersion,MatchedRules,S3Objects', + KeyConditionExpression=Key('SHA256').eq(sha) ).get('Items') - if most_recent_item: - lambda_version = int(most_recent_item[0]['LambdaVersion']['N']) - matched_rules = set(most_recent_item[0]['MatchedRules']['SS']) - s3_objects = set(most_recent_item[0]['S3Objects']['SS']) + if most_recent_items: + analyzer_version = int(most_recent_items[0]['AnalyzerVersion']) + matched_rules = set(most_recent_items[0]['MatchedRules']) + s3_objects = set(most_recent_items[0]['S3Objects']) # When re-analyzing all binaries, only one S3 object will be added to the DB at a time. # In order to prevent spurious alerts about new S3 objects, we report S3 objects from - # the previous two Lambda versions. #Hacktastic! - if len(most_recent_item) >= 2: - s3_objects = s3_objects.union(set(most_recent_item[1]['S3Objects']['SS'])) - return lambda_version, matched_rules, s3_objects + # the previous Lambda version as well. 
+            previous_s3_objects = set()
+            if len(most_recent_items) >= 2:
+                previous_s3_objects = set(most_recent_items[1]['S3Objects'])
+            return analyzer_version, matched_rules, s3_objects, previous_s3_objects
         else:
             return None
 
-    def _create_new_entry(self, binary, lambda_version):
+    def _create_new_entry(self, binary: BinaryInfo, analyzer_version: int) -> None:
         """Create a new Dynamo entry with YARA match information."""
-        LOGGER.info('Creating new entry (SHA256: %s, LambdaVersion: %d)',
-                    binary.computed_sha, lambda_version)
+        LOGGER.info('Creating new entry (SHA256: %s, AnalyzerVersion: %d)',
+                    binary.computed_sha, analyzer_version)
         item = {
-            'SHA256': {'S': binary.computed_sha},
-            'LambdaVersion': {'N': str(lambda_version)},
-            'MD5_Computed': {'S': binary.computed_md5},
-            'MatchedRules': {'SS': binary.matched_rule_ids},
-            'S3Objects': {'SS': [binary.s3_identifier]}
+            'SHA256': binary.computed_sha,
+            'AnalyzerVersion': analyzer_version,
+            'MatchedRules': binary.matched_rule_ids,
+            'MD5': binary.computed_md5,
+            'S3LastModified': binary.s3_last_modified,
+            'S3Metadata': binary.s3_metadata,
+            'S3Objects': {binary.s3_identifier}
         }
-        if binary.reported_md5:
-            item['MD5_Reported'] = {'S': binary.reported_md5}
-        if binary.observed_path:
-            item['SamplePath'] = {'S': binary.observed_path}
-
-        self._client.put_item(TableName=self._table_name, Item=item)
+        self._table.put_item(Item=item)
 
-    def _add_s3_key(self, binary, lambda_version):
+    def _add_s3_key(self, binary: BinaryInfo, analyzer_version: int) -> None:
         """Add S3 key to an existing entry. If the S3 key already exists, this is a no-op."""
-        LOGGER.info('Adding %s to existing entry (SHA256: %s, LambdaVersion: %d)',
-                    binary.s3_identifier, binary.computed_sha, lambda_version)
-        self._client.update_item(
-            TableName=self._table_name,
-            Key={'SHA256': {'S': binary.computed_sha}, 'LambdaVersion': {'N': str(lambda_version)}},
+        LOGGER.info('Adding %s to existing entry (SHA256: %s, AnalyzerVersion: %d)',
+                    binary.s3_identifier, binary.computed_sha, analyzer_version)
+        self._table.update_item(
+            Key={'SHA256': binary.computed_sha, 'AnalyzerVersion': analyzer_version},
             UpdateExpression='ADD S3Objects :s3_string_set',
-            ExpressionAttributeValues={':s3_string_set': {'SS': [binary.s3_identifier]}}
+            ExpressionAttributeValues={':s3_string_set': {binary.s3_identifier}}
         )
 
-    def save_matches(self, binary, lambda_version):
+    def save_matches(self, binary: BinaryInfo, analyzer_version: int) -> bool:
         """Save YARA match results to the Dynamo table.
 
         Args:
-            binary: [BinaryInfo] Instance containing information about the binary.
-            lambda_version: [int] Version of the currently executing Lambda function.
+            binary: Instance containing information about the binary.
+            analyzer_version: Version of the currently executing Lambda function.
 
         Returns:
-            [boolean] Whether an alert should be fired. Returns True if:
+            Whether an alert should be fired. Returns True if:
                 The current Lambda version is >= the most recent analysis version AND
                 (a) Any YARA rule is matched now that was not matched in the previous version, OR
                 (b) A new S3 object appears which is identical to an already matched binary.
@@ -233,27 +240,29 @@ def save_matches(self, binary, lambda_version):
 
         if item_tuple is not None:
             # An entry already exists for this SHA.
-            item_lambda_version, item_matched_rules, item_s3_objects = item_tuple
+            item_lambda_version, item_matched_rules, item_s3_objects, previous_objects = item_tuple
 
             # Update the DB appropriately.
- if lambda_version != item_lambda_version: + if analyzer_version != item_lambda_version: # This binary has never been matched by this Lambda version. - self._create_new_entry(binary, lambda_version) + self._create_new_entry(binary, analyzer_version) elif binary.s3_identifier not in item_s3_objects: # A new S3 object is identical to a previously-matched binary. - self._add_s3_key(binary, lambda_version) + self._add_s3_key(binary, analyzer_version) # Decide whether we need to alert. - if lambda_version < item_lambda_version: + if analyzer_version < item_lambda_version: LOGGER.warning('Current Lambda version %d is < version %d from previous analysis', - lambda_version, item_lambda_version) - elif (bool(set(binary.matched_rule_ids) - item_matched_rules) or - binary.s3_identifier not in item_s3_objects): - # Either a new YARA rule matched or a new S3 object was found. + analyzer_version, item_lambda_version) + elif bool(binary.matched_rule_ids - item_matched_rules): + # A new YARA rule matched this binary. + needs_alert = True + elif binary.s3_identifier not in item_s3_objects.union(previous_objects): + # A new S3 object matched (which did not match in the previous version). needs_alert = True else: # This binary has never been matched before. - self._create_new_entry(binary, lambda_version) + self._create_new_entry(binary, analyzer_version) needs_alert = True return needs_alert diff --git a/lambda_functions/analyzer/binary_info.py b/lambda_functions/analyzer/binary_info.py index 8bfb3b1..f6fe497 100644 --- a/lambda_functions/analyzer/binary_info.py +++ b/lambda_functions/analyzer/binary_info.py @@ -1,30 +1,32 @@ """Keeps track of all information associated with and computed about a binary.""" -import logging import os import tempfile import time +from typing import Any, Dict, Set import uuid if __package__: # Imported by unit tests or other external code. from lambda_functions.analyzer import analyzer_aws_lib, file_hash + from lambda_functions.analyzer.common import LOGGER + from lambda_functions.analyzer.yara_analyzer import YaraAnalyzer else: import analyzer_aws_lib + from common import LOGGER import file_hash - -LOGGER = logging.getLogger() + from yara_analyzer import YaraAnalyzer class BinaryInfo(object): """Organizes the analysis of a single binary blob in S3.""" - def __init__(self, bucket_name, object_key, yara_analyzer): + def __init__(self, bucket_name: str, object_key: str, yara_analyzer: YaraAnalyzer): """Create a new BinaryInfo. Args: - bucket_name: [string] S3 bucket name. - object_key: [string] S3 object key. - yara_analyzer: [YaraAnalyzer] built from a compiled rules file. + bucket_name: S3 bucket name. + object_key: S3 object key. + yara_analyzer: Analyzer built from a compiled rules file. """ self.bucket_name = bucket_name self.object_key = object_key @@ -36,19 +38,25 @@ def __init__(self, bucket_name, object_key, yara_analyzer): # Computed after file download and analysis. self.download_time_ms = 0 - self.reported_md5 = self.observed_path = '' - self.computed_sha = self.computed_md5 = None + self.s3_last_modified = '' + self.s3_metadata = {} + self.computed_md5 = None + self.computed_sha = None self.yara_matches = [] # List of yara.Match objects. 
-    @property
-    def matched_rule_ids(self):
-        """A list of 'yara_file:rule_name' for each YARA match."""
-        return ['{}:{}'.format(match.namespace, match.rule) for match in self.yara_matches]
-
     def __str__(self):
         """Use the S3 identifier as the string representation of the binary."""
         return self.s3_identifier
 
+    def _download_from_s3(self):
+        """Download binary from S3 and measure elapsed time."""
+        LOGGER.debug('Downloading %s to %s', self.object_key, self.download_path)
+
+        start_time = time.time()
+        self.s3_last_modified, self.s3_metadata = analyzer_aws_lib.download_from_s3(
+            self.bucket_name, self.object_key, self.download_path)
+        self.download_time_ms = (time.time() - start_time) * 1000
+
     def __enter__(self):
         """Download the binary from S3 and run YARA analysis."""
         self._download_from_s3()
@@ -56,7 +64,8 @@ def __enter__(self):
 
         LOGGER.debug('Running YARA analysis')
         self.yara_matches = self.yara_analyzer.analyze(
-            self.download_path, original_target_path=self.observed_path)
+            self.download_path, original_target_path=self.filepath
+        )
 
         return self
 
@@ -69,46 +78,45 @@ def __exit__(self, exception_type, exception_value, traceback):
             file.truncate()
         os.remove(self.download_path)
 
-    def _download_from_s3(self):
-        """Download binary from S3 and measure elapsed time."""
-        LOGGER.debug('Downloading %s to %s', self.object_key, self.download_path)
-
-        start_time = time.time()
-        s3_metadata = analyzer_aws_lib.download_from_s3(
-            self.bucket_name, self.object_key, self.download_path)
-        self.download_time_ms = (time.time() - start_time) * 1000
+    @property
+    def matched_rule_ids(self) -> Set[str]:
+        """A set of 'yara_file:rule_name' for each YARA match."""
+        return set('{}:{}'.format(match.namespace, match.rule) for match in self.yara_matches)
 
-        self.reported_md5 = s3_metadata.get('reported_md5', '')
-        self.observed_path = s3_metadata.get('observed_path', '')
+    @property
+    def filepath(self) -> str:
+        """The filepath from the S3 metadata, if present."""
+        return self.s3_metadata.get('filepath', '')
 
-    def save_matches_and_alert(self, lambda_version, dynamo_table_name, sns_topic_arn):
+    def save_matches_and_alert(
+            self, analyzer_version: int, dynamo_table_name: str, sns_topic_arn: str) -> None:
         """Save match results to Dynamo and publish an alert to SNS if appropriate.
 
         Args:
-            lambda_version: [int] The currently executing version of the Lambda function.
-            dynamo_table_name: [string] Save YARA match results to this Dynamo table.
-            sns_topic_arn: [string] Publish match alerts to this SNS topic ARN.
+            analyzer_version: The currently executing version of the Lambda function.
+            dynamo_table_name: Save YARA match results to this Dynamo table.
+            sns_topic_arn: Publish match alerts to this SNS topic ARN.
         """
         table = analyzer_aws_lib.DynamoMatchTable(dynamo_table_name)
-        needs_alert = table.save_matches(self, lambda_version)
+        needs_alert = table.save_matches(self, analyzer_version)
 
         # Send alert if appropriate.
         if needs_alert:
             LOGGER.info('Publishing an SNS alert')
             analyzer_aws_lib.publish_alert_to_sns(self, sns_topic_arn)
 
-    def summary(self):
+    def summary(self) -> Dict[str, Any]:
         """Generate a summary dictionary of binary attributes."""
         result = {
             'FileInfo': {
-                'ComputedMD5': self.computed_md5,
-                'ComputedSHA256': self.computed_sha,
-                'ReportedMD5': self.reported_md5,
+                'MD5': self.computed_md5,
+                'S3LastModified': self.s3_last_modified,
                 'S3Location': self.s3_identifier,
-                'SamplePath': self.observed_path
+                'S3Metadata': self.s3_metadata,
+                'SHA256': self.computed_sha
             },
-            'NumMatchedRules': len(self.yara_matches),
-            'MatchedRules': {}
+            'MatchedRules': {},
+            'NumMatchedRules': len(self.yara_matches)
         }
 
         for index, match in enumerate(self.yara_matches, start=1):
diff --git a/lambda_functions/analyzer/common.py b/lambda_functions/analyzer/common.py
new file mode 100644
index 0000000..d8f6114
--- /dev/null
+++ b/lambda_functions/analyzer/common.py
@@ -0,0 +1,11 @@
+"""Common resources shared among the analyzer components."""
+import logging
+import os
+
+LOGGER = logging.getLogger()
+LOGGER.setLevel(logging.INFO)
+
+# Define the name and location of the compiled YARA rules file.
+COMPILED_RULES_FILENAME = 'compiled_yara_rules.bin'
+THIS_DIRECTORY = os.path.dirname(os.path.realpath(__file__))  # Directory containing this file.
+COMPILED_RULES_FILEPATH = os.path.join(THIS_DIRECTORY, COMPILED_RULES_FILENAME)
diff --git a/lambda_functions/analyzer/file_hash.py b/lambda_functions/analyzer/file_hash.py
index 9fb8bfa..dac25c8 100644
--- a/lambda_functions/analyzer/file_hash.py
+++ b/lambda_functions/analyzer/file_hash.py
@@ -1,18 +1,21 @@
 """Memory-efficient file hashing."""
 import hashlib
+import io
+from typing import Generator, Tuple
 
 MB = 2 ** 20  # ~ 1 million bytes
 
 
-def _read_in_chunks(file_object, chunk_size=2*MB):
+def _read_in_chunks(
+        file_object: io.FileIO, chunk_size: int = 2*MB) -> Generator[bytes, None, None]:
     """Read a file in fixed-size chunks (to minimize memory usage for large files).
 
     Args:
         file_object: An opened file-like object supporting read().
-        chunk_size: [int] Max size (in bytes) of each file chunk.
+        chunk_size: Max size (in bytes) of each file chunk.
 
     Yields:
-        [string] file chunks, each of size at most chunk_size.
+        File chunks, each of size at most chunk_size.
     """
     while True:
         chunk = file_object.read(chunk_size)
@@ -22,16 +25,16 @@ def _read_in_chunks(file_object, chunk_size=2*MB):
         return  # End of file.
 
 
-def compute_hashes(file_path):
+def compute_hashes(file_path: str) -> Tuple[str, str]:
     """Compute SHA and MD5 hashes for the specified file object.
 
     The MD5 is only included to be compatible with other security tools.
 
     Args:
-        file_path: [string] File path to be analyzed.
+        file_path: File path to be analyzed.
 
     Returns:
-        String tuple (sha_hash, md5_hash).
+        SHA256 hash, MD5 hash.
     """
     sha = hashlib.sha256()
     md5 = hashlib.md5()
diff --git a/lambda_functions/analyzer/main.py b/lambda_functions/analyzer/main.py
index eb12f72..6cf8619 100644
--- a/lambda_functions/analyzer/main.py
+++ b/lambda_functions/analyzer/main.py
@@ -5,42 +5,31 @@
 #   YARA_MATCHES_DYNAMO_TABLE_NAME: Name of the Dynamo table which stores YARA match results.
 #   YARA_ALERTS_SNS_TOPIC_ARN: ARN of the SNS topic which should be alerted on a YARA match.
 # Expects a binary YARA rules file to be at './compiled_yara_rules.bin'
-import logging
 import os
+from typing import Any, Dict
 import urllib
 
-from yara import Error as YaraError
 from botocore.exceptions import ClientError as BotoError
 
 if __package__:  # Imported by unit tests or other external code.
     from lambda_functions.analyzer import analyzer_aws_lib, binary_info, yara_analyzer
+    from lambda_functions.analyzer.common import COMPILED_RULES_FILEPATH, LOGGER
 else:
     import analyzer_aws_lib
     import binary_info
+    from common import COMPILED_RULES_FILEPATH, LOGGER
     import yara_analyzer
 
-LOGGER = logging.getLogger()
-LOGGER.setLevel(logging.INFO)
-
-THIS_DIRECTORY = os.path.dirname(os.path.realpath(__file__))  # Directory containing this file.
-COMPILED_RULES_FILENAME = 'compiled_yara_rules.bin'  # Binary YARA rules file.
-COMPILED_RULES_FILEPATH = os.path.join(THIS_DIRECTORY, COMPILED_RULES_FILENAME)
-
 # Build the YaraAnalyzer from the compiled rules file at import time (i.e. once per container).
 # This saves 50-100+ ms per Lambda invocation, depending on the size of the rules file.
-# However, this breaks imports when the compiled rules file doesn't exist (e.g. unit tests).
-# Fail-safe to computing the ANALYZER during the handler if we have to.
-try:
-    ANALYZER = yara_analyzer.YaraAnalyzer(COMPILED_RULES_FILEPATH)
-    # Due to a bug in yara-python, num_rules only be computed once. Thereafter, it will return 0.
-    # So we have to compute this here since multiple invocations may share the same analyzer.
-    NUM_YARA_RULES = ANALYZER.num_rules
-except YaraError:
-    ANALYZER = None
+ANALYZER = yara_analyzer.YaraAnalyzer(COMPILED_RULES_FILEPATH)
+# Due to a bug in yara-python, num_rules can only be computed once. Thereafter, it will return 0.
+# So we have to compute this here since multiple invocations may share the same analyzer.
+NUM_YARA_RULES = ANALYZER.num_rules
 
 
-def analyze_lambda_handler(event_data, lambda_context):
+def analyze_lambda_handler(event_data: Dict[str, Any], lambda_context) -> Dict[str, Dict[str, Any]]:
     """Lambda function entry point.
 
     Args:
@@ -52,18 +41,18 @@
         lambda_context: LambdaContext object (with .function_version).
 
     Returns:
-        A dict mapping S3 object identifier [string] to a summary [dict] of file info and matched
-        YARA rule information.
+        A dict mapping S3 object identifier to a summary of file info and matched YARA rules.
+        Example: {
+            'S3:bucket:key': {
+                'FileInfo': { ... },
+                'MatchedRules': { ... },
+                'NumMatchedRules': 1
+            }
+        }
     """
     result = {}
     binaries = []  # List of the BinaryInfo data.
 
-    # Build the YaraAnalyzer now if we could not do it when this file was imported.
-    global ANALYZER, NUM_YARA_RULES  # pylint: disable=global-statement
-    if not ANALYZER:
-        ANALYZER = yara_analyzer.YaraAnalyzer(COMPILED_RULES_FILEPATH)
-        NUM_YARA_RULES = ANALYZER.num_rules
-
     # The Lambda version must be an integer.
     try:
         lambda_version = int(lambda_context.function_version)
diff --git a/lambda_functions/analyzer/yara_analyzer.py b/lambda_functions/analyzer/yara_analyzer.py
index 9ab987d..2a24194 100644
--- a/lambda_functions/analyzer/yara_analyzer.py
+++ b/lambda_functions/analyzer/yara_analyzer.py
@@ -1,5 +1,6 @@
 """Wrapper around YARA analysis."""
 import os
+from typing import Dict, List
 
 import yara
 
@@ -7,28 +8,28 @@
 class YaraAnalyzer(object):
     """Encapsulates YARA analysis and matching functions."""
 
-    def __init__(self, rules_file):
+    def __init__(self, rules_file: str):
         """Initialize the analyzer with a prebuilt binary YARA rules file.
 
         Args:
-            rules_file: [string] Path to the binary rules file.
+            rules_file: Path to the binary rules file.
""" self._rules = yara.load(rules_file) @property - def num_rules(self): + def num_rules(self) -> int: """Count the number of YARA rules loaded in the analyzer.""" return sum(1 for _ in self._rules) @staticmethod - def _yara_variables(original_target_path): + def _yara_variables(original_target_path: str) -> Dict[str, str]: """Compute external variables needed for some YARA rules. Args: - original_target_path: [string] Path where the binary was originally discovered. + original_target_path: Path where the binary was originally discovered. Returns: - A dictionary mapping string variable names to string values. + A map from YARA variable names to their computed values. """ file_name = os.path.basename(original_target_path) file_suffix = file_name.split('.')[-1] if '.' in file_name else '' # e.g. "exe" or "rar". @@ -39,12 +40,12 @@ def _yara_variables(original_target_path): 'filetype': file_suffix.upper() # Used in only one rule (checking for "GIF"). } - def analyze(self, target_file, original_target_path=''): + def analyze(self, target_file: str, original_target_path: str = '') -> List: """Run YARA analysis on a file. Args: - target_file: [string] Local path to target file to be analyzed. - original_target_path: [string] Path where the target file was originally discovered. + target_file: Local path to target file to be analyzed. + original_target_path: Path where the target file was originally discovered. Returns: List of yara.Match objects. diff --git a/lambda_functions/build.py b/lambda_functions/build.py index bf369d9..3553074 100644 --- a/lambda_functions/build.py +++ b/lambda_functions/build.py @@ -7,7 +7,7 @@ import pip -from lambda_functions.analyzer.main import COMPILED_RULES_FILENAME +from lambda_functions.analyzer.common import COMPILED_RULES_FILENAME from rules.compile_rules import compile_rules LAMBDA_DIR = os.path.dirname(os.path.realpath(__file__)) diff --git a/lambda_functions/downloader/main.py b/lambda_functions/downloader/main.py index 15b7261..14ae51c 100644 --- a/lambda_functions/downloader/main.py +++ b/lambda_functions/downloader/main.py @@ -56,19 +56,18 @@ def _download_from_carbon_black(binary: Binary) -> str: @backoff.on_exception(backoff.expo, (ObjectNotFoundError, zipfile.BadZipFile), max_tries=8, jitter=backoff.full_jitter) def _build_metadata(binary: Binary) -> Dict[str, str]: - """Return basic CarbonBlack metadata to make it easier to triage YARA match alerts.""" + """Return basic metadata to make it easier to triage YARA match alerts.""" LOGGER.info('Retrieving binary metadata') return { 'carbon_black_group': ','.join(binary.group), 'carbon_black_host_count': str(binary.host_count), - 'carbon_black_last_seen': binary.last_seen, 'carbon_black_md5': binary.md5, - 'carbon_black_observed_filename': ( + 'carbon_black_os_type': binary.os_type, + 'carbon_black_virustotal_score': str(binary.virustotal.score), + 'filepath': ( # Throw out any non-ascii characters (S3 metadata must be ascii). 
binary.observed_filenames[0].encode('ascii', 'ignore').decode('ascii') - ), - 'carbon_black_os_type': binary.os_type, - 'carbon_black_virustotal_score': str(binary.virustotal.score) + ) } diff --git a/manage.py b/manage.py index 2cf6002..32113aa 100644 --- a/manage.py +++ b/manage.py @@ -425,7 +425,7 @@ def live_test(self) -> None: bucket.put_object( Body=EICAR_STRING.encode('UTF-8'), Key=test_filename, - Metadata={'observed_path': test_filename} + Metadata={'filepath': test_filename} ) table_name = '{}_binaryalert_matches'.format(self._config.name_prefix) @@ -442,7 +442,7 @@ def live_test(self) -> None: Select='ALL_ATTRIBUTES', Limit=1, ConsistentRead=True, - ScanIndexForward=False, # Sort by LambdaVersion descending (e.g. newest first). + ScanIndexForward=False, # Sort by AnalyzerVersion descending (e.g. newest first). KeyConditionExpression=Key('SHA256').eq(eicar_sha256), FilterExpression=Attr('S3Objects').contains(s3_identifier) ).get('Items') @@ -453,8 +453,8 @@ def live_test(self) -> None: pprint.pprint(items[0]) print('\nRemoving DynamoDB EICAR entry...') - lambda_version = items[0]['LambdaVersion'] - table.delete_item(Key={'SHA256': eicar_sha256, 'LambdaVersion': lambda_version}) + lambda_version = items[0]['AnalyzerVersion'] + table.delete_item(Key={'SHA256': eicar_sha256, 'AnalyzerVersion': lambda_version}) break elif attempt == 10: print('\nFAIL: Expected DynamoDB entry for the EICAR file was *not* found!\n') diff --git a/terraform/dynamo.tf b/terraform/dynamo.tf index 1b606a8..48344cd 100644 --- a/terraform/dynamo.tf +++ b/terraform/dynamo.tf @@ -2,7 +2,7 @@ resource "aws_dynamodb_table" "binaryalert_yara_matches" { name = "${var.name_prefix}_binaryalert_matches" hash_key = "SHA256" - range_key = "LambdaVersion" + range_key = "AnalyzerVersion" read_capacity = "${var.dynamo_read_capacity}" write_capacity = "${var.dynamo_write_capacity}" @@ -13,7 +13,7 @@ resource "aws_dynamodb_table" "binaryalert_yara_matches" { } attribute { - name = "LambdaVersion" + name = "AnalyzerVersion" type = "N" } diff --git a/tests/boto3_mocks.py b/tests/boto3_mocks.py index d2c026d..261b551 100644 --- a/tests/boto3_mocks.py +++ b/tests/boto3_mocks.py @@ -2,9 +2,6 @@ # This is similar to the moto library. Unfortunately, moto does not support complex Dynamo updates, # has a limited feature set, and has some Python3 bugs. To keep it simple, we instead build only # the simplest mocks we need. -import collections -import io - from botocore.vendored.requests.adapters import HTTPAdapter @@ -31,114 +28,3 @@ def __init__(self, function_version=1, time_limit_ms=30000): def get_remaining_time_in_millis(self): """Returns the original time limit (self.time_limit_ms).""" return self.time_limit_ms - - -class MockCloudwatchCient(object): - """http://boto3.readthedocs.io/en/latest/reference/services/cloudwatch.html#client""" - def __init__(self): - # Maps namespace to list of published metric data. 
- self.metric_data = collections.defaultdict(list) - - def put_metric_data(self, **kwargs): - """Published metric data is added to an in-memory list.""" - self.metric_data[kwargs['Namespace']].extend(kwargs['MetricData']) - - -class MockDynamoItem(object): - """Stores original item values and a list of update strings.""" - def __init__(self, key_value_dict): - self.key_value_dict = key_value_dict - self.updates = [] - - def update(self, update_expression): - """Append a new update to the item.""" - self.updates.append(update_expression) - - -class MockDynamoTable(object): - """Supports only String, StringSet, Map, and List.""" - def __init__(self, table_name, hash_key, range_key): - """Creates mock dynamo table with numeric range key.""" - self.table_name = table_name - self.hash_key = hash_key - self.range_key = range_key - self.items = {} # Maps composite key to MockDynamoItem. - - def _composite_key(self, item): - """Create a tuple composite key out of an item's attributes.""" - return item[self.hash_key]['S'], item[self.range_key]['N'] - - def put_item(self, item): - """Add a new Dynamo item (dict of key-value pairs).""" - self.items[self._composite_key(item)] = MockDynamoItem(item) - - def query(self, hash_key_value): - """Return all original item values for the given SHA (updates are NOT applied).""" - return {'Items': [item.key_value_dict for item in self.items.values() if - item.key_value_dict[self.hash_key]['S'] == hash_key_value]} - - def update_item(self, key_expr, update_expression): - """Append a new update to the item with the given hash/range key.""" - self.items[self._composite_key(key_expr)].update(update_expression) - - -class MockDynamoDBClient(object): - """http://boto3.readthedocs.io/en/latest/reference/services/dynamodb.html#client""" - def __init__(self, table_name, hash_key, range_key): - """Create boto3.client('dynamodb') with a single pre-existing table.""" - self.tables = {table_name: MockDynamoTable(table_name, hash_key, range_key)} - - def put_item(self, **kwargs): - """Put a new item in the table.""" - table = self.tables[kwargs['TableName']] - table.put_item(kwargs['Item']) - - def query(self, **kwargs): - """Get item details by SHA256.""" - return self.tables[kwargs['TableName']].query( - kwargs['ExpressionAttributeValues'][':sha']['S']) - - def update_item(self, **kwargs): - """Update an existing item in the table (appends rather than replaces).""" - table = self.tables[kwargs['TableName']] - # Replace attribute values in update expression. 
- update_expr = kwargs['UpdateExpression'] - for attr_key, attr_value in kwargs.get('ExpressionAttributeValues', {}).items(): - update_expr = update_expr.replace(attr_key, str(attr_value)) - table.update_item(kwargs['Key'], update_expr) - - -class MockS3Client(object): - """http://boto3.readthedocs.io/en/latest/reference/services/s3.html#S3.Client""" - def __init__(self, bucket_name, object_key, file_contents, file_metadata): - """Create boto3.client('s3'), pre-populated with test data.""" - self.buckets = {bucket_name: {object_key: (file_contents, file_metadata)}} - - def get_object(self, **kwargs): - """Return object contents as bytesIO.""" - file_contents, file_metadata = self.buckets[kwargs['Bucket']][kwargs['Key']] - return { - 'Body': io.BytesIO(file_contents.encode('utf-8')), - 'Metadata': file_metadata - } - - -class MockSNSClient(object): - """http://boto3.readthedocs.io/en/latest/reference/services/sns.html#client""" - def __init__(self): - self.topics = collections.defaultdict(list) # Maps topic arn to list of publications. - - def publish(self, **kwargs): - """Record an SNS publication in the history.""" - self.topics[kwargs['TopicArn']].append(kwargs) - - -class MockSQSClient(object): - """http://boto3.readthedocs.io/en/latest/reference/services/sqs.html#client""" - def __init__(self, queue_url, receipts): - self.queues = {queue_url: receipts} - - def delete_message_batch(self, **kwargs): - """Delete a batch of SQS message receipts.""" - for entry in kwargs['Entries']: - self.queues[kwargs['QueueUrl']].remove(entry['ReceiptHandle']) diff --git a/tests/lambda_functions/analyzer/analyzer_aws_lib_test.py b/tests/lambda_functions/analyzer/analyzer_aws_lib_test.py index 17932d6..4badc37 100644 --- a/tests/lambda_functions/analyzer/analyzer_aws_lib_test.py +++ b/tests/lambda_functions/analyzer/analyzer_aws_lib_test.py @@ -1,23 +1,16 @@ """Unit tests for analyzer_aws_lib.py. Uses mock boto3 clients.""" +# pylint: disable=protected-access import unittest from unittest import mock -import boto3 - from lambda_functions.analyzer import analyzer_aws_lib, binary_info -from tests import boto3_mocks, yara_mocks +from tests import yara_mocks MOCK_DYNAMO_TABLE_NAME = 'mock-dynamo-table' -HASH_KEY = 'SHA256' -RANGE_KEY = 'LambdaVersion' - -REAL_BOTO3_CLIENT = boto3.client class AnalyzerAWSLibStandaloneTest(unittest.TestCase): """Test top-level functions in analyzer_aws_lib.py""" - # pylint: disable=protected-access - def test_elide_string_middle(self): """Check that string elision works as expected (generated string is not too long).""" alphabet = 'abcdefghijklmnopqrstuvwxyz' @@ -34,106 +27,195 @@ def test_elide_string_middle(self): self.assertEqual(alphabet, analyzer_aws_lib._elide_string_middle(alphabet, 50)) +@mock.patch.object(analyzer_aws_lib, 'DYNAMODB') class DynamoMatchTableTest(unittest.TestCase): """Test DynamoMatchTable class.""" def setUp(self): - """Before each test, create the mock environment.""" - # Create a mock Dynamo table. - self._mock_dynamo_client = boto3_mocks.MockDynamoDBClient( - MOCK_DYNAMO_TABLE_NAME, HASH_KEY, RANGE_KEY) - self._mock_dynamo_table = self._mock_dynamo_client.tables[MOCK_DYNAMO_TABLE_NAME] - - # Setup mocks. 
-        boto3.client = mock.MagicMock(return_value=self._mock_dynamo_client)
-
-        self._binary = binary_info.BinaryInfo('Bucket', 'Key', None)
-        self._binary.reported_md5 = 'Original_MD5'
-        self._binary.observed_path = '/bin/path/run.exe'
-        self._binary.yara_matches = [yara_mocks.YaraMatchMock('file.yara', 'rule_name')]
-        self._binary.computed_sha = 'Computed_SHA'
+        """Before each test, set up a BinaryInfo."""
+        self._binary = binary_info.BinaryInfo('test-bucket', 'test-key', None)
+        self._binary.s3_last_modified = 'time:right_now'
+        self._binary.s3_metadata = {'test-filename': 'test.txt'}
         self._binary.computed_md5 = 'Computed_MD5'
+        self._binary.computed_sha = 'Computed_SHA'
+        self._binary.yara_matches = [yara_mocks.YaraMatchMock('file.yara', 'rule_name')]
 
-        self._match_table = analyzer_aws_lib.DynamoMatchTable(MOCK_DYNAMO_TABLE_NAME)
-
-    @classmethod
-    def tearDown(cls):
-        """Restore the mocked out methods to their originals for other unit tests."""
-        boto3.client = REAL_BOTO3_CLIENT
-
-    def _add_item(self, lambda_version=1, s3_objects=None):
-        """Add an item to the mock Dynamo table."""
-        self._mock_dynamo_table.put_item(
-            {
-                'SHA256': {'S': self._binary.computed_sha},
-                'LambdaVersion': {'N': str(lambda_version)},
-                'MatchedRules': {'SS': self._binary.matched_rule_ids},
-                'S3Objects': {'SS': s3_objects or [self._binary.s3_identifier]}
-            }
-        )
-
-    def test_new_sha(self):
+    def test_new_sha(self, mock_table: mock.MagicMock):
         """A binary matches YARA rules for the first time - create DB entry and alert."""
-        needs_alert = self._match_table.save_matches(self._binary, 1)
+        match_table = analyzer_aws_lib.DynamoMatchTable(MOCK_DYNAMO_TABLE_NAME)
+        match_table._table.query = lambda **kwargs: {}
 
-        self.assertTrue(needs_alert)
-        stored_item = self._mock_dynamo_table.items[(self._binary.computed_sha, '1')]
-        for expected in [self._binary.observed_path, self._binary.matched_rule_ids[0]]:
-            self.assertTrue(expected in str(stored_item.key_value_dict.values()))
+        needs_alert = match_table.save_matches(self._binary, 1)
 
-    def test_new_version_same_rules_same_objects(self):
-        """Same results with new Lambda version - create DB entry but do not alert."""
-        self._add_item()
-        needs_alert = self._match_table.save_matches(self._binary, 2)
+        self.assertTrue(needs_alert)
+        mock_table.assert_has_calls([
+            mock.call.Table().put_item(Item={
+                'SHA256': 'Computed_SHA',
+                'AnalyzerVersion': 1,
+                'MatchedRules': {'file.yara:rule_name'},
+                'MD5': 'Computed_MD5',
+                'S3LastModified': 'time:right_now',
+                'S3Metadata': {'test-filename': 'test.txt'},
+                'S3Objects': {'S3:test-bucket:test-key'}
+            })
+        ])
+
+    def test_new_version_same_rules_same_objects(self, mock_table: mock.MagicMock):
+        """Same results with new Lambda version - create new DB entry but do not alert."""
+        match_table = analyzer_aws_lib.DynamoMatchTable(MOCK_DYNAMO_TABLE_NAME)
+        match_table._table.query = lambda **kwargs: {
+            'Items': [
+                {
+                    'AnalyzerVersion': 1,
+                    'MatchedRules': {'file.yara:rule_name'},
+                    'S3Objects': {'S3:test-bucket:test-key'}
+                }
+            ]
+        }
+
+        needs_alert = match_table.save_matches(self._binary, 2)
 
         self.assertFalse(needs_alert)
-        self.assertEqual([(self._binary.computed_sha, '1'), (self._binary.computed_sha, '2')],
-                         sorted(self._mock_dynamo_table.items))
-
-    def test_new_version_multiple_objects(self):
-        """Multiple S3 objects can be added without triggering an alert if seen previously."""
-        self._add_item(lambda_version=1, s3_objects=['S3_1', 'S3_2', 'S3_3'])
+        mock_table.assert_has_calls([
+            mock.call.Table().put_item(Item={
+                'SHA256': 
'Computed_SHA', + 'AnalyzerVersion': 2, + 'MatchedRules': {'file.yara:rule_name'}, + 'MD5': 'Computed_MD5', + 'S3LastModified': 'time:right_now', + 'S3Metadata': {'test-filename': 'test.txt'}, + 'S3Objects': {'S3:test-bucket:test-key'} + }) + ]) + + def test_new_version_multiple_objects(self, mock_table: mock.MagicMock): + """No alerts should fire for any of multiple binaries which have already matched.""" + match_table = analyzer_aws_lib.DynamoMatchTable(MOCK_DYNAMO_TABLE_NAME) + match_table._table.query = lambda **kwargs: { + 'Items': [ + { + 'AnalyzerVersion': 1, + 'MatchedRules': {'file.yara:rule_name'}, + 'S3Objects': {'S3_1', 'S3_2', 'S3_3'} + } + ] + } self._binary.s3_identifier = 'S3_1' - self.assertFalse(self._match_table.save_matches(self._binary, 2)) + self.assertFalse(match_table.save_matches(self._binary, 2)) + + match_table._table.query = lambda **kwargs: { + 'Items': [ + { + 'AnalyzerVersion': 2, + 'MatchedRules': {'file.yara:rule_name'}, + 'S3Objects': {'S3_1'} + }, + { + 'AnalyzerVersion': 1, + 'MatchedRules': {'file.yara:rule_name'}, + 'S3Objects': {'S3_1', 'S3_2', 'S3_3'} + } + ] + } self._binary.s3_identifier = 'S3_2' - self.assertFalse(self._match_table.save_matches(self._binary, 2)) + self.assertFalse(match_table.save_matches(self._binary, 2)) self._binary.s3_identifier = 'S3_3' - self.assertFalse(self._match_table.save_matches(self._binary, 2)) - - def test_new_version_new_rules_same_objects(self): + self.assertFalse(match_table.save_matches(self._binary, 2)) + + mock_table.assert_has_calls([ + mock.call.Table().put_item(Item=mock.ANY), + mock.call.Table().update_item( + ExpressionAttributeValues={':s3_string_set': {'S3_2'}}, + Key={'SHA256': 'Computed_SHA', 'AnalyzerVersion': 2}, + UpdateExpression='ADD S3Objects :s3_string_set' + ), + mock.call.Table().update_item( + ExpressionAttributeValues={':s3_string_set': {'S3_3'}}, + Key={'SHA256': 'Computed_SHA', 'AnalyzerVersion': 2}, + UpdateExpression='ADD S3Objects :s3_string_set' + ) + ]) + + def test_new_version_new_rules_same_objects(self, mock_table: mock.MagicMock): """A previously analyzed binary matches a new YARA rule - create DB entry and alert.""" - self._add_item() + match_table = analyzer_aws_lib.DynamoMatchTable(MOCK_DYNAMO_TABLE_NAME) + match_table._table.query = lambda **kwargs: { + 'Items': [ + { + 'AnalyzerVersion': 1, + 'MatchedRules': {'file.yara:rule_name'}, + 'S3Objects': {'S3:test-bucket:test-key'} + } + ] + } self._binary.yara_matches.append( - yara_mocks.YaraMatchMock('new_file.yara', 'better_rule_name')) - needs_alert = self._match_table.save_matches(self._binary, 2) + yara_mocks.YaraMatchMock('new_file.yara', 'different_rule_name')) - self.assertTrue(needs_alert) - stored_item = self._mock_dynamo_table.items[(self._binary.computed_sha, '2')] - self.assertTrue('new_file.yara' in str(stored_item.key_value_dict.values())) + needs_alert = match_table.save_matches(self._binary, 2) - def test_same_version_same_rules_new_object(self): + self.assertTrue(needs_alert) + mock_table.assert_has_calls([ + mock.call.Table().put_item(Item={ + 'SHA256': 'Computed_SHA', + 'AnalyzerVersion': 2, + 'MatchedRules': {'new_file.yara:different_rule_name', 'file.yara:rule_name'}, + 'MD5': 'Computed_MD5', + 'S3LastModified': 'time:right_now', + 'S3Metadata': {'test-filename': 'test.txt'}, + 'S3Objects': {'S3:test-bucket:test-key'}}) + ]) + + def test_same_version_same_rules_new_object(self, mock_table: mock.MagicMock): """The only thing that changes is a new S3 key - update DB entry and alert.""" - self._add_item() + 
match_table = analyzer_aws_lib.DynamoMatchTable(MOCK_DYNAMO_TABLE_NAME) + match_table._table.query = lambda **kwargs: { + 'Items': [ + { + 'AnalyzerVersion': 1, + 'MatchedRules': {'file.yara:rule_name'}, + 'S3Objects': {'S3:test-bucket:test-key'} + } + ] + } self._binary.s3_identifier = 'S3:{}:{}'.format(self._binary.bucket_name, 'NEW_KEY') - needs_alert = self._match_table.save_matches(self._binary, 1) - self.assertTrue(needs_alert) - stored_item = self._mock_dynamo_table.items[(self._binary.computed_sha, '1')] - for expected in ['ADD', 'NEW_KEY']: - self.assertTrue(expected in stored_item.updates[0]) + needs_alert = match_table.save_matches(self._binary, 1) - def test_old_version(self): + self.assertTrue(needs_alert) + mock_table.assert_has_calls([ + mock.call.Table().update_item( + ExpressionAttributeValues={':s3_string_set': {'S3:test-bucket:NEW_KEY'}}, + Key={'SHA256': 'Computed_SHA', 'AnalyzerVersion': 1}, + UpdateExpression='ADD S3Objects :s3_string_set' + ) + ]) + + @mock.patch.object(analyzer_aws_lib, 'LOGGER') + def test_old_version(self, mock_logger: mock.MagicMock, mock_table: mock.MagicMock): """Analyze with an older version of the Lambda function - update DB but do not alert.""" - self._add_item() + match_table = analyzer_aws_lib.DynamoMatchTable(MOCK_DYNAMO_TABLE_NAME) + match_table._table.query = lambda **kwargs: { + 'Items': [ + { + 'AnalyzerVersion': 1, + 'MatchedRules': {'file.yara:rule_name'}, + 'S3Objects': {'S3:test-bucket:test-key'} + } + ] + } self._binary.yara_matches.append( - yara_mocks.YaraMatchMock('new_file.yara', 'better_rule_name')) - needs_alert = self._match_table.save_matches(self._binary, 0) + yara_mocks.YaraMatchMock('new_file.yara', 'different_rule_name')) + needs_alert = match_table.save_matches(self._binary, 0) self.assertFalse(needs_alert) # Don't alert even if there was a change - self.assertEqual([(self._binary.computed_sha, '0'), (self._binary.computed_sha, '1')], - sorted(self._mock_dynamo_table.items)) + mock_logger.assert_has_calls([ + mock.call.warning( + 'Current Lambda version %d is < version %d from previous analysis', 0, 1 + ) + ]) + mock_table.assert_has_calls([mock.call.Table().put_item(Item=mock.ANY)]) if __name__ == '__main__': diff --git a/tests/lambda_functions/analyzer/main_test.py b/tests/lambda_functions/analyzer/main_test.py index 370301f..f34eec6 100644 --- a/tests/lambda_functions/analyzer/main_test.py +++ b/tests/lambda_functions/analyzer/main_test.py @@ -2,36 +2,54 @@ import hashlib import json import os +import tempfile import unittest from unittest import mock import urllib -import boto3 from pyfakefs import fake_filesystem_unittest -from lambda_functions.analyzer import main +from lambda_functions.analyzer.common import COMPILED_RULES_FILEPATH from tests import boto3_mocks, yara_mocks -# Mock S3 bucket and object. +# Mock S3 bucket and objects. MOCK_S3_BUCKET_NAME = 'mock-bucket' -MOCK_FILE_CONTENTS = 'Hello, evil world!\n' -MOCK_FILE_METADATA = { - 'observed_path': '/path/to/mock-evil.exe', - 'reported_md5': 'REPORTED MD5' -} -MOCK_S3_OBJECT_KEY = 'space plus+file.test' - -# Mimics minimal parts of S3:ObjectAdded event that triggers the lambda function. 
-LAMBDA_VERSION = 1 -TEST_CONTEXT = boto3_mocks.MockLambdaContext(LAMBDA_VERSION) +FILE_MODIFIED_TIME = 'test-last-modified' +GOOD_FILE_CONTENTS = 'Hello, world!\n' +GOOD_FILE_METADATA = {'filepath': 'win32'} +GOOD_S3_OBJECT_KEY = 'space plus+file.test' +EVIL_FILE_CONTENTS = 'Hello, evil world!\n' +EVIL_FILE_METADATA = {'filepath': '/path/to/mock-evil.exe'} +EVIL_S3_OBJECT_KEY = 'evil.exe' MOCK_DYNAMO_TABLE_NAME = 'mock-dynamo-table' -HASH_KEY = 'SHA256' -RANGE_KEY = 'LambdaVersion' MOCK_SNS_TOPIC_ARN = 's3:mock-sns-arn' MOCK_SQS_URL = 'https://sqs.mock.url' MOCK_SQS_RECEIPTS = ['sqs_receipt1', 'sqs_receipt2'] +# Mimics minimal parts of S3:ObjectAdded event that triggers the lambda function. +LAMBDA_VERSION = 1 +TEST_CONTEXT = boto3_mocks.MockLambdaContext(LAMBDA_VERSION) + + +class MockS3Object(object): + """Simple mock for boto3.resource('s3').Object""" + def __init__(self, bucket_name, object_key): + self.name = bucket_name + self.key = object_key + + def download_file(self, download_path): + with open(download_path, 'w') as f: + f.write(GOOD_FILE_CONTENTS if self.key == GOOD_S3_OBJECT_KEY else EVIL_FILE_CONTENTS) + + @property + def last_modified(self): + return FILE_MODIFIED_TIME + + @property + def metadata(self): + return GOOD_FILE_METADATA if self.key == GOOD_S3_OBJECT_KEY else EVIL_FILE_METADATA + class MainTest(fake_filesystem_unittest.TestCase): """Test end-to-end functionality of the analyzer.""" @@ -47,151 +65,95 @@ def setUp(self): # Set up the fake filesystem. self.setUpPyfakefs() - os.mkdir('/tmp') - os.makedirs(os.path.dirname(main.COMPILED_RULES_FILEPATH)) - yara_mocks.save_test_yara_rules(main.COMPILED_RULES_FILEPATH) - - # Mock cloudwatch client. - self._mock_cloudwatch_client = boto3_mocks.MockCloudwatchCient() - - # Create a mock Dynamo table. - self._mock_dynamo_client = boto3_mocks.MockDynamoDBClient( - MOCK_DYNAMO_TABLE_NAME, HASH_KEY, RANGE_KEY) - self._mock_dynamo_table = self._mock_dynamo_client.tables[MOCK_DYNAMO_TABLE_NAME] - os.environ['YARA_MATCHES_DYNAMO_TABLE_NAME'] = MOCK_DYNAMO_TABLE_NAME + os.makedirs(os.path.dirname(COMPILED_RULES_FILEPATH)) + os.makedirs(tempfile.gettempdir()) + yara_mocks.save_test_yara_rules(COMPILED_RULES_FILEPATH) - # Create a mock S3 bucket and "upload" a file to it. - self._mock_s3_client = boto3_mocks.MockS3Client( - MOCK_S3_BUCKET_NAME, MOCK_S3_OBJECT_KEY, MOCK_FILE_CONTENTS, MOCK_FILE_METADATA) + # Set environment variables. os.environ['S3_BUCKET_NAME'] = MOCK_S3_BUCKET_NAME - - # Create mock SNS topic. - self._mock_sns_client = boto3_mocks.MockSNSClient() - os.environ['YARA_ALERTS_SNS_TOPIC_ARN'] = MOCK_SNS_TOPIC_ARN - - # Create mock SQS queue. - self._mock_sqs_client = boto3_mocks.MockSQSClient(MOCK_SQS_URL, MOCK_SQS_RECEIPTS) os.environ['SQS_QUEUE_URL'] = MOCK_SQS_URL - - # Enable the boto3 mocks. - self._real_boto3_client = boto3.client - boto3.client = mock.MagicMock(side_effect=self._boto3_client_mock) + os.environ['YARA_MATCHES_DYNAMO_TABLE_NAME'] = MOCK_DYNAMO_TABLE_NAME + os.environ['YARA_ALERTS_SNS_TOPIC_ARN'] = MOCK_SNS_TOPIC_ARN # Create test event. self._test_event = { - 'S3Objects': [urllib.parse.quote_plus(MOCK_S3_OBJECT_KEY)], + # Two objects, which match different YARA rules. + 'S3Objects': [urllib.parse.quote_plus(GOOD_S3_OBJECT_KEY), EVIL_S3_OBJECT_KEY], 'SQSReceipts': MOCK_SQS_RECEIPTS } - def tearDown(self): - """Restore boto3.client to its original.""" - boto3.client = self._real_boto3_client + # Import the module under test (now that YARA is mocked out). 
+ with mock.patch('boto3.client'), mock.patch('boto3.resource'): + from lambda_functions.analyzer import main + self.main = main + + # Reset each boto3 resource (sometimes necessary depending on import order). + self.main.analyzer_aws_lib.CLOUDWATCH = mock.MagicMock() + self.main.analyzer_aws_lib.DYNAMODB = mock.MagicMock() + self.main.analyzer_aws_lib.S3 = mock.MagicMock() + self.main.analyzer_aws_lib.SNS = mock.MagicMock() + self.main.analyzer_aws_lib.SQS = mock.MagicMock() + + # Mock S3 Object + self.main.analyzer_aws_lib.S3.Object = MockS3Object @classmethod def tearDownClass(cls): """Restore YARA calls to their original.""" yara_mocks.disable_yara_mocks() - def _boto3_client_mock(self, service_name): - """Return one of the internal mocks for boto3.client().""" - service_map = { - 'cloudwatch': self._mock_cloudwatch_client, - 'dynamodb': self._mock_dynamo_client, - 's3': self._mock_s3_client, - 'sns': self._mock_sns_client, - 'sqs': self._mock_sqs_client - } - return service_map[service_name] - - def test_new_matching_file_added(self): - """Verify return value, Dynamo update, and SNS alert when a new file matches a YARA rule.""" - md5 = hashlib.md5(MOCK_FILE_CONTENTS.encode('utf-8')).hexdigest() - sha = hashlib.sha256(MOCK_FILE_CONTENTS.encode('utf-8')).hexdigest() - result = main.analyze_lambda_handler(self._test_event, TEST_CONTEXT) + def test_analyze_lambda_handler(self): + """Verify return value, logging, and boto3 calls when multiple files match YARA rules.""" + with mock.patch.object(self.main, 'LOGGER') as mock_logger: + result = self.main.analyze_lambda_handler(self._test_event, TEST_CONTEXT) + # Verify logging statements. + mock_logger.assert_has_calls([ + mock.call.info('Processing %d record(s)', 2), + mock.call.info('Analyzing "%s"', GOOD_S3_OBJECT_KEY), + mock.call.warning( + '%s matched YARA rules: %s', + mock.ANY, + {'externals.yar:filename_contains_win32'} + ), + mock.call.info('Analyzing "%s"', EVIL_S3_OBJECT_KEY), + mock.call.warning( + '%s matched YARA rules: %s', + mock.ANY, + {'evil_check.yar:contains_evil', 'externals.yar:extension_is_exe'} + ) + ]) # Verify return value. - s3_id = 'S3:{}:{}'.format(MOCK_S3_BUCKET_NAME, MOCK_S3_OBJECT_KEY) + good_s3_id = 'S3:{}:{}'.format(MOCK_S3_BUCKET_NAME, GOOD_S3_OBJECT_KEY) + evil_s3_id = 'S3:{}:{}'.format(MOCK_S3_BUCKET_NAME, EVIL_S3_OBJECT_KEY) expected = { - s3_id: { + good_s3_id: { 'FileInfo': { - 'ComputedMD5': md5, - 'ComputedSHA256': sha, - 'ReportedMD5': MOCK_FILE_METADATA['reported_md5'], - 'S3Location': s3_id, - 'SamplePath': MOCK_FILE_METADATA['observed_path'] + 'MD5': hashlib.md5(GOOD_FILE_CONTENTS.encode('utf-8')).hexdigest(), + 'S3LastModified': FILE_MODIFIED_TIME, + 'S3Location': good_s3_id, + 'S3Metadata': GOOD_FILE_METADATA, + 'SHA256': hashlib.sha256(GOOD_FILE_CONTENTS.encode('utf-8')).hexdigest() }, - 'NumMatchedRules': 2, 'MatchedRules': { 'Rule1': { - 'MatchedStrings': ['$evil_string'], - 'Meta': { - 'author': 'Austin Byers', - 'description': ('A helpful description about why this rule matches ' - 'dastardly evil files.') - }, - 'RuleFile': 'evil_check.yar', - 'RuleName': 'contains_evil', - 'RuleTags': ['mock_rule', 'has_meta'] - }, - 'Rule2': { 'MatchedStrings': [], 'Meta': {}, 'RuleFile': 'externals.yar', - 'RuleName': 'extension_is_exe', + 'RuleName': 'filename_contains_win32', 'RuleTags': ['mock_rule'] } - } - } - } - - self.assertEqual(expected, result) - - # Verify that the return value can be encoded as JSON. 
- json.dumps(result) - - # Verify that a new entry was made to Dynamo with all of the expected data. - key_value_dict = self._mock_dynamo_table.items[(sha, str(LAMBDA_VERSION))].key_value_dict - for expected in [md5, MOCK_S3_OBJECT_KEY, 'evil_check.yar:contains_evil']: - self.assertIn(expected, str(key_value_dict.values())) - - # Verify that an alert was published to SNS. - alert = self._mock_sns_client.topics[MOCK_SNS_TOPIC_ARN][0]['Message'] - for data in [md5, sha, 'evil_check.yar', 'externals.yar', s3_id]: - self.assertIn(data, alert) - - # Verify that the SQS receipts were deleted. - self.assertEqual([], self._mock_sqs_client.queues[MOCK_SQS_URL]) - - # Verify that the correct metrics were published to Cloudwatch. - expected_metrics = { - 'AnalyzedBinaries': 1, 'MatchedBinaries': 1, 'YaraRules': 3, 'LambdaVersion': 1 - } - for metric in self._mock_cloudwatch_client.metric_data['BinaryAlert']: - if metric['MetricName'] in expected_metrics: - self.assertEqual(expected_metrics[metric['MetricName']], metric['Value']) - - # Verify that the downloaded file was removed from /tmp. - self.assertEqual([], os.listdir('/tmp')) - - def test_multiple_records(self): - """Verify that results are returned for multiple records.""" - # Add two different files to mock S3. - self._mock_s3_client.buckets[MOCK_S3_BUCKET_NAME]['KEY2'] = ('Evilicious', {}) - self._mock_s3_client.buckets[MOCK_S3_BUCKET_NAME]['KEY3'] = ('', {'observed_path': 'win32'}) - self._test_event['S3Objects'] = ['KEY2', 'KEY3'] - - # Verify return value. - result = main.analyze_lambda_handler(self._test_event, TEST_CONTEXT) - expected = { - 'S3:{}:KEY2'.format(MOCK_S3_BUCKET_NAME): { + }, + 'NumMatchedRules': 1 + }, + evil_s3_id: { 'FileInfo': { - 'ComputedMD5': hashlib.md5('Evilicious'.encode('utf-8')).hexdigest(), - 'ComputedSHA256': hashlib.sha256('Evilicious'.encode('utf-8')).hexdigest(), - 'ReportedMD5': '', - 'S3Location': 'S3:{}:KEY2'.format(MOCK_S3_BUCKET_NAME), - 'SamplePath': '' + 'MD5': hashlib.md5(EVIL_FILE_CONTENTS.encode('utf-8')).hexdigest(), + 'S3LastModified': FILE_MODIFIED_TIME, + 'S3Location': evil_s3_id, + 'S3Metadata': EVIL_FILE_METADATA, + 'SHA256': hashlib.sha256(EVIL_FILE_CONTENTS.encode('utf-8')).hexdigest() }, - 'NumMatchedRules': 1, 'MatchedRules': { 'Rule1': { 'MatchedStrings': ['$evil_string'], @@ -203,41 +165,88 @@ def test_multiple_records(self): 'RuleFile': 'evil_check.yar', 'RuleName': 'contains_evil', 'RuleTags': ['mock_rule', 'has_meta'] - } - } - }, - 'S3:{}:KEY3'.format(MOCK_S3_BUCKET_NAME): { - 'FileInfo': { - 'ComputedMD5': hashlib.md5(''.encode('utf-8')).hexdigest(), - 'ComputedSHA256': hashlib.sha256(''.encode('utf-8')).hexdigest(), - 'ReportedMD5': '', - 'S3Location': 'S3:{}:KEY3'.format(MOCK_S3_BUCKET_NAME), - 'SamplePath': 'win32' - }, - 'NumMatchedRules': 1, - 'MatchedRules': { - 'Rule1': { + }, + 'Rule2': { 'MatchedStrings': [], 'Meta': {}, 'RuleFile': 'externals.yar', - 'RuleName': 'filename_contains_win32', + 'RuleName': 'extension_is_exe', 'RuleTags': ['mock_rule'] } - } + }, + 'NumMatchedRules': 2 } } + self.assertEqual(expected, result) - # Verify that return value can be encoded as JSON. + # Verify that the return value can be encoded as JSON. json.dumps(result) - # Verify cloudwatch metrics. 
- expected_metrics = { - 'AnalyzedBinaries': 2, 'MatchedBinaries': 2, 'YaraRules': 3, 'LambdaVersion': 1 - } - for metric in self._mock_cloudwatch_client.metric_data['BinaryAlert']: - if metric['MetricName'] in expected_metrics: - self.assertEqual(expected_metrics[metric['MetricName']], metric['Value']) + # Verify that the Dynamo table was created. + self.main.analyzer_aws_lib.DYNAMODB.assert_has_calls([ + mock.call.Table(MOCK_DYNAMO_TABLE_NAME) + ]) + + # Verify an SNS message was published. + self.main.analyzer_aws_lib.SNS.assert_has_calls([ + mock.call.Topic(MOCK_SNS_TOPIC_ARN), + mock.call.Topic().publish( + Message=mock.ANY, + Subject='[BinaryAlert] win32 matches a YARA rule' + ), + mock.call.Topic(MOCK_SNS_TOPIC_ARN), + mock.call.Topic().publish( + Message=mock.ANY, + Subject='[BinaryAlert] /path/to/mock-evil.exe matches a YARA rule' + ) + ]) + + # Verify the SQS receipts were deleted. + self.main.analyzer_aws_lib.SQS.assert_has_calls([ + mock.call.Queue(MOCK_SQS_URL), + mock.call.Queue().delete_messages(Entries=[ + {'Id': '0', 'ReceiptHandle': 'sqs_receipt1'}, + {'Id': '1', 'ReceiptHandle': 'sqs_receipt2'} + ]) + ]) + + # Verify the correct metrics were published to Cloudwatch. + self.main.analyzer_aws_lib.CLOUDWATCH.assert_has_calls([ + mock.call.put_metric_data( + MetricData=[ + { + 'MetricName': 'AnalyzedBinaries', + 'Value': 2, + 'Unit': 'Count' + }, + { + 'MetricName': 'MatchedBinaries', + 'Value': 2, + 'Unit': 'Count' + }, + { + 'MetricName': 'YaraRules', + 'Value': 3, + 'Unit': 'Count' + }, + { + 'MetricName': 'S3DownloadLatency', + 'StatisticValues': { + 'Minimum': mock.ANY, + 'Maximum': mock.ANY, + 'SampleCount': 2, + 'Sum': mock.ANY + }, + 'Unit': 'Milliseconds' + } + ], + Namespace='BinaryAlert' + ) + ]) + + # Verify that the downloaded file was removed from temp storage. + self.assertEqual([], os.listdir(tempfile.gettempdir())) if __name__ == '__main__': diff --git a/tests/lambda_functions/build_test.py b/tests/lambda_functions/build_test.py index 07a0d84..62dd4c7 100644 --- a/tests/lambda_functions/build_test.py +++ b/tests/lambda_functions/build_test.py @@ -37,6 +37,7 @@ def test_build_analyzer(self, mock_print): '__init__.py', 'analyzer_aws_lib.py', 'binary_info.py', + 'common.py', 'compiled_yara_rules.bin', 'file_hash.py', 'libpython3.5m.so.1.0', diff --git a/tests/lambda_functions/downloader/main_test.py b/tests/lambda_functions/downloader/main_test.py index 1ec2b4d..906d66f 100644 --- a/tests/lambda_functions/downloader/main_test.py +++ b/tests/lambda_functions/downloader/main_test.py @@ -88,11 +88,10 @@ def test_download_from_carbon_black(self): expected_metadata = { 'carbon_black_group': 'Production,Laptops', 'carbon_black_host_count': '2', - 'carbon_black_last_seen': 'sometime-recently', 'carbon_black_md5': 'ABC123', - 'carbon_black_observed_filename': '/Users/name/file.txt', 'carbon_black_os_type': 'Linux', - 'carbon_black_virustotal_score': '0' + 'carbon_black_virustotal_score': '0', + 'filepath': '/Users/name/file.txt' } self.download_main.S3_BUCKET.assert_has_calls([