217 changes: 113 additions & 104 deletions lambda_functions/analyzer/analyzer_aws_lib.py

Large diffs are not rendered by default.
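The largest file in this PR is collapsed, but its interface can be inferred from the call sites in `binary_info.py` and `main.py`: `download_from_s3` now returns a `(last_modified, metadata)` pair, while `DynamoMatchTable` and `publish_alert_to_sns` handle persistence and alerting. A hedged sketch of that inferred interface follows; it is reconstructed from usage, not from the actual diff:

```python
"""Interface sketch for analyzer_aws_lib, inferred from call sites; not the real diff."""
from typing import Dict, Tuple

import boto3


def download_from_s3(bucket_name: str, object_key: str,
                     download_path: str) -> Tuple[str, Dict[str, str]]:
    """Download an S3 object; return its LastModified timestamp and user-defined metadata."""
    s3_object = boto3.resource('s3').Object(bucket_name, object_key)
    s3_object.download_file(download_path)
    return str(s3_object.last_modified), s3_object.metadata


def publish_alert_to_sns(binary, topic_arn: str) -> None:
    """Publish a YARA match alert for the given BinaryInfo to an SNS topic."""
    boto3.client('sns').publish(
        TopicArn=topic_arn,
        Subject='BinaryAlert: YARA match',  # Assumed subject line, not from the diff.
        Message=str(binary.summary())
    )


class DynamoMatchTable:
    """Stores YARA match results; the real implementation is in the unrendered diff."""

    def __init__(self, table_name: str) -> None:
        self._table = boto3.resource('dynamodb').Table(table_name)

    def save_matches(self, binary, analyzer_version: int) -> bool:
        """Save match results and return True if an SNS alert should be published."""
        raise NotImplementedError  # The actual logic is not shown in this PR view.
```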

82 changes: 45 additions & 37 deletions lambda_functions/analyzer/binary_info.py
@@ -1,30 +1,32 @@
"""Keeps track of all information associated with and computed about a binary."""
import logging
import os
import tempfile
import time
from typing import Any, Dict, Set
import uuid

if __package__:
# Imported by unit tests or other external code.
from lambda_functions.analyzer import analyzer_aws_lib, file_hash
from lambda_functions.analyzer.common import LOGGER
from lambda_functions.analyzer.yara_analyzer import YaraAnalyzer
else:
import analyzer_aws_lib
from common import LOGGER
import file_hash

LOGGER = logging.getLogger()
from yara_analyzer import YaraAnalyzer


class BinaryInfo(object):
"""Organizes the analysis of a single binary blob in S3."""

def __init__(self, bucket_name, object_key, yara_analyzer):
def __init__(self, bucket_name: str, object_key: str, yara_analyzer: YaraAnalyzer):
"""Create a new BinaryInfo.

Args:
bucket_name: [string] S3 bucket name.
object_key: [string] S3 object key.
yara_analyzer: [YaraAnalyzer] built from a compiled rules file.
bucket_name: S3 bucket name.
object_key: S3 object key.
yara_analyzer: Analyzer built from a compiled rules file.
"""
self.bucket_name = bucket_name
self.object_key = object_key
@@ -36,27 +38,34 @@ def __init__(self, bucket_name, object_key, yara_analyzer):

# Computed after file download and analysis.
self.download_time_ms = 0
self.reported_md5 = self.observed_path = ''
self.computed_sha = self.computed_md5 = None
self.s3_last_modified = ''
self.s3_metadata = {}
self.computed_md5 = None
self.computed_sha = None
self.yara_matches = [] # List of yara.Match objects.

@property
def matched_rule_ids(self):
"""A list of 'yara_file:rule_name' for each YARA match."""
return ['{}:{}'.format(match.namespace, match.rule) for match in self.yara_matches]

def __str__(self):
"""Use the S3 identifier as the string representation of the binary."""
return self.s3_identifier

def _download_from_s3(self):
"""Download binary from S3 and measure elapsed time."""
LOGGER.debug('Downloading %s to %s', self.object_key, self.download_path)

start_time = time.time()
self.s3_last_modified, self.s3_metadata = analyzer_aws_lib.download_from_s3(
self.bucket_name, self.object_key, self.download_path)
self.download_time_ms = (time.time() - start_time) * 1000

def __enter__(self):
"""Download the binary from S3 and run YARA analysis."""
self._download_from_s3()
self.computed_sha, self.computed_md5 = file_hash.compute_hashes(self.download_path)

LOGGER.debug('Running YARA analysis')
self.yara_matches = self.yara_analyzer.analyze(
self.download_path, original_target_path=self.observed_path)
self.download_path, original_target_path=self.filepath
)

return self

@@ -69,46 +78,45 @@ def __exit__(self, exception_type, exception_value, traceback):
file.truncate()
os.remove(self.download_path)

def _download_from_s3(self):
"""Download binary from S3 and measure elapsed time."""
LOGGER.debug('Downloading %s to %s', self.object_key, self.download_path)

start_time = time.time()
s3_metadata = analyzer_aws_lib.download_from_s3(
self.bucket_name, self.object_key, self.download_path)
self.download_time_ms = (time.time() - start_time) * 1000
@property
def matched_rule_ids(self) -> Set[str]:
"""A list of 'yara_file:rule_name' for each YARA match."""
return set('{}:{}'.format(match.namespace, match.rule) for match in self.yara_matches)

self.reported_md5 = s3_metadata.get('reported_md5', '')
self.observed_path = s3_metadata.get('observed_path', '')
@property
def filepath(self) -> str:
"""The filepath from the S3 metadata, if present."""
return self.s3_metadata.get('filepath', '')

def save_matches_and_alert(self, lambda_version, dynamo_table_name, sns_topic_arn):
def save_matches_and_alert(
self, analyzer_version: int, dynamo_table_name: str, sns_topic_arn: str) -> None:
"""Save match results to Dynamo and publish an alert to SNS if appropriate.

Args:
lambda_version: [int] The currently executing version of the Lambda function.
dynamo_table_name: [string] Save YARA match results to this Dynamo table.
sns_topic_arn: [string] Publish match alerts to this SNS topic ARN.
analyzer_version: The currently executing version of the Lambda function.
dynamo_table_name: Save YARA match results to this Dynamo table.
sns_topic_arn: Publish match alerts to this SNS topic ARN.
"""
table = analyzer_aws_lib.DynamoMatchTable(dynamo_table_name)
needs_alert = table.save_matches(self, lambda_version)
needs_alert = table.save_matches(self, analyzer_version)

# Send alert if appropriate.
if needs_alert:
LOGGER.info('Publishing an SNS alert')
analyzer_aws_lib.publish_alert_to_sns(self, sns_topic_arn)

def summary(self):
def summary(self) -> Dict[str, Any]:
"""Generate a summary dictionary of binary attributes."""
result = {
'FileInfo': {
'ComputedMD5': self.computed_md5,
'ComputedSHA256': self.computed_sha,
'ReportedMD5': self.reported_md5,
'MD5': self.computed_md5,
'S3LastModified': self.s3_last_modified,
'S3Location': self.s3_identifier,
'SamplePath': self.observed_path
'S3Metadata': self.s3_metadata,
'SHA256': self.computed_sha
},
'NumMatchedRules': len(self.yara_matches),
'MatchedRules': {}
'MatchedRules': {},
'NumMatchedRules': len(self.yara_matches)
}

for index, match in enumerate(self.yara_matches, start=1):
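With `__enter__` and `__exit__` both defined, the class reads as a context manager: entry downloads, hashes, and scans the binary; exit shreds the temporary file even if analysis raised. A minimal usage sketch, where the bucket, key, table, and topic values are all placeholders:

```python
# Usage sketch for BinaryInfo; all resource names below are placeholders.
from lambda_functions.analyzer import binary_info, yara_analyzer
from lambda_functions.analyzer.common import COMPILED_RULES_FILEPATH

analyzer = yara_analyzer.YaraAnalyzer(COMPILED_RULES_FILEPATH)

with binary_info.BinaryInfo('my-bucket', 'space/invaders.exe', analyzer) as binary:
    # On entry, the binary has already been downloaded, hashed, and scanned.
    if binary.yara_matches:
        binary.save_matches_and_alert(
            1, 'my-dynamo-table', 'arn:aws:sns:us-east-1:123456789012:my-topic')
    print(binary.summary())
# On exit, the temporary download was zeroed out and deleted.
```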
11 changes: 11 additions & 0 deletions lambda_functions/analyzer/common.py
@@ -0,0 +1,11 @@
"""Common resources shared among the analyzer components."""
import logging
import os

LOGGER = logging.getLogger()
LOGGER.setLevel(logging.INFO)

# Define the name and location of the compiled YARA rules file.
COMPILED_RULES_FILENAME = 'compiled_yara_rules.bin'
THIS_DIRECTORY = os.path.dirname(os.path.realpath(__file__)) # Directory containing this file.
COMPILED_RULES_FILEPATH = os.path.join(THIS_DIRECTORY, COMPILED_RULES_FILENAME)
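Both `main.py` and `binary_info.py` previously carried their own copies of these definitions; after this change they import the shared names instead, e.g.:

```python
# Sketch: how the other analyzer modules consume the shared definitions.
from lambda_functions.analyzer.common import COMPILED_RULES_FILEPATH, LOGGER

LOGGER.info('Compiled rules expected at %s', COMPILED_RULES_FILEPATH)
```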
14 changes: 8 additions & 6 deletions lambda_functions/analyzer/file_hash.py
@@ -1,18 +1,20 @@
"""Memory-efficient file hashing."""
import hashlib
import io
from typing import Generator, Tuple

MB = 2 ** 20 # ~ 1 million bytes


def _read_in_chunks(file_object, chunk_size=2*MB):
def _read_in_chunks(file_object: io.FileIO, chunk_size: int = 2*MB) -> Generator[bytes, None, None]:
"""Read a file in fixed-size chunks (to minimize memory usage for large files).

Args:
file_object: An opened file-like object supporting read().
chunk_size: [int] Max size (in bytes) of each file chunk.
chunk_size: Max size (in bytes) of each file chunk.

Yields:
[string] file chunks, each of size at most chunk_size.
File chunks, each of size at most chunk_size.
"""
while True:
chunk = file_object.read(chunk_size)
@@ -22,16 +24,16 @@ def _read_in_chunks(file_object, chunk_size=2*MB):
return # End of file.


def compute_hashes(file_path):
def compute_hashes(file_path: str) -> Tuple[str, str]:
"""Compute SHA and MD5 hashes for the specified file object.

The MD5 is only included to be compatible with other security tools.

Args:
file_path: [string] File path to be analyzed.
file_path: File path to be analyzed.

Returns:
String tuple (sha_hash, md5_hash).
SHA256 hash, MD5 hash.
"""
sha = hashlib.sha256()
md5 = hashlib.md5()
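The rest of the function is collapsed, but the chunked pattern is clear from what is shown: feed each fixed-size chunk to both digests so large files never load fully into memory. A self-contained sketch of the same technique, assuming the file is opened in binary mode:

```python
# Self-contained sketch of the chunked hashing technique used by compute_hashes.
import hashlib
from typing import Generator, Tuple

MB = 2 ** 20  # ~ 1 million bytes


def _read_in_chunks(file_object, chunk_size: int = 2 * MB) -> Generator[bytes, None, None]:
    """Yield fixed-size chunks so large files are never fully loaded into memory."""
    while True:
        chunk = file_object.read(chunk_size)
        if not chunk:
            return  # End of file.
        yield chunk


def compute_hashes(file_path: str) -> Tuple[str, str]:
    """Return (sha256_hexdigest, md5_hexdigest) for the file at file_path."""
    sha = hashlib.sha256()
    md5 = hashlib.md5()
    with open(file_path, 'rb') as file_object:
        for chunk in _read_in_chunks(file_object):
            sha.update(chunk)  # Each chunk updates both digests incrementally.
            md5.update(chunk)
    return sha.hexdigest(), md5.hexdigest()
```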
43 changes: 16 additions & 27 deletions lambda_functions/analyzer/main.py
@@ -5,42 +5,31 @@
# YARA_MATCHES_DYNAMO_TABLE_NAME: Name of the Dynamo table which stores YARA match results.
# YARA_ALERTS_SNS_TOPIC_ARN: ARN of the SNS topic which should be alerted on a YARA match.
# Expects a binary YARA rules file to be at './compiled_yara_rules.bin'
import logging
import os
from typing import Any, Dict
import urllib

from yara import Error as YaraError
from botocore.exceptions import ClientError as BotoError

if __package__:
# Imported by unit tests or other external code.
from lambda_functions.analyzer import analyzer_aws_lib, binary_info, yara_analyzer
from lambda_functions.analyzer.common import COMPILED_RULES_FILEPATH, LOGGER
else:
import analyzer_aws_lib
import binary_info
from common import COMPILED_RULES_FILEPATH, LOGGER
import yara_analyzer

LOGGER = logging.getLogger()
LOGGER.setLevel(logging.INFO)

THIS_DIRECTORY = os.path.dirname(os.path.realpath(__file__)) # Directory containing this file.
COMPILED_RULES_FILENAME = 'compiled_yara_rules.bin' # Binary YARA rules file.
COMPILED_RULES_FILEPATH = os.path.join(THIS_DIRECTORY, COMPILED_RULES_FILENAME)

# Build the YaraAnalyzer from the compiled rules file at import time (i.e. once per container).
# This saves 50-100+ ms per Lambda invocation, depending on the size of the rules file.
# However, this breaks imports when the compiled rules file doesn't exist (e.g. unit tests).
# Fall back to computing the ANALYZER during the handler if we have to.
try:
ANALYZER = yara_analyzer.YaraAnalyzer(COMPILED_RULES_FILEPATH)
    # Due to a bug in yara-python, num_rules can only be computed once. Thereafter, it will return 0.
# So we have to compute this here since multiple invocations may share the same analyzer.
NUM_YARA_RULES = ANALYZER.num_rules
except YaraError:
ANALYZER = None
ANALYZER = yara_analyzer.YaraAnalyzer(COMPILED_RULES_FILEPATH)
# Due to a bug in yara-python, num_rules can only be computed once. Thereafter, it will return 0.
# So we have to compute this here since multiple invocations may share the same analyzer.
NUM_YARA_RULES = ANALYZER.num_rules


def analyze_lambda_handler(event_data, lambda_context):
def analyze_lambda_handler(event_data: Dict[str, Any], lambda_context) -> Dict[str, Dict[str, Any]]:
"""Lambda function entry point.

Args:
@@ -52,18 +41,18 @@ def analyze_lambda_handler(event_data, lambda_context):
lambda_context: LambdaContext object (with .function_version).

Returns:
A dict mapping S3 object identifier [string] to a summary [dict] of file info and matched
YARA rule information.
A dict mapping S3 object identifier to a summary of file info and matched YARA rules.
Example: {
'S3:bucket:key': {
'FileInfo': { ... },
'MatchedRules': { ... },
'NumMatchedRules': 1
}
}
"""
result = {}
binaries = [] # List of the BinaryInfo data.

# Build the YaraAnalyzer now if we could not do it when this file was imported.
global ANALYZER, NUM_YARA_RULES # pylint: disable=global-statement
if not ANALYZER:
ANALYZER = yara_analyzer.YaraAnalyzer(COMPILED_RULES_FILEPATH)
NUM_YARA_RULES = ANALYZER.num_rules

# The Lambda version must be an integer.
try:
lambda_version = int(lambda_context.function_version)
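The deleted block was an instance of a common Lambda idiom: perform expensive setup at import time so warm containers reuse it, and fall back to lazy initialization inside the handler if import-time setup failed. A generic sketch of that idiom, with a stand-in for the YARA rule loading:

```python
# Generic sketch of import-time initialization with a lazy handler fallback.
# build_analyzer() is a stand-in for yara.load() on the compiled rules file.
from typing import Any, Optional


def build_analyzer() -> Any:
    """Stand-in for expensive setup, e.g. loading compiled YARA rules."""
    return object()


try:
    ANALYZER: Optional[Any] = build_analyzer()  # Runs once per Lambda container.
except OSError:
    ANALYZER = None  # e.g. unit tests, where the rules artifact may not exist.


def handler(event: Any, context: Any) -> Any:
    global ANALYZER  # pylint: disable=global-statement
    if ANALYZER is None:
        ANALYZER = build_analyzer()  # The fail-safe path this PR removes.
    return ANALYZER
```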
19 changes: 10 additions & 9 deletions lambda_functions/analyzer/yara_analyzer.py
@@ -1,34 +1,35 @@
"""Wrapper around YARA analysis."""
import os
from typing import Dict, List

import yara


class YaraAnalyzer(object):
"""Encapsulates YARA analysis and matching functions."""

def __init__(self, rules_file):
def __init__(self, rules_file: str):
"""Initialize the analyzer with a prebuilt binary YARA rules file.

Args:
rules_file: [string] Path to the binary rules file.
rules_file: Path to the binary rules file.
"""
self._rules = yara.load(rules_file)

@property
def num_rules(self):
def num_rules(self) -> int:
"""Count the number of YARA rules loaded in the analyzer."""
return sum(1 for _ in self._rules)

@staticmethod
def _yara_variables(original_target_path):
def _yara_variables(original_target_path: str) -> Dict[str, str]:
"""Compute external variables needed for some YARA rules.

Args:
original_target_path: [string] Path where the binary was originally discovered.
original_target_path: Path where the binary was originally discovered.

Returns:
A dictionary mapping string variable names to string values.
A map from YARA variable names to their computed values.
"""
file_name = os.path.basename(original_target_path)
file_suffix = file_name.split('.')[-1] if '.' in file_name else '' # e.g. "exe" or "rar".
@@ -39,12 +40,12 @@ def _yara_variables(original_target_path):
'filetype': file_suffix.upper() # Used in only one rule (checking for "GIF").
}

def analyze(self, target_file, original_target_path=''):
def analyze(self, target_file: str, original_target_path: str = '') -> List:
"""Run YARA analysis on a file.

Args:
target_file: [string] Local path to target file to be analyzed.
original_target_path: [string] Path where the target file was originally discovered.
target_file: Local path to target file to be analyzed.
original_target_path: Path where the target file was originally discovered.

Returns:
List of yara.Match objects.
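The elided body of `analyze` presumably forwards these variables to `self._rules.match`. A self-contained sketch of the underlying yara-python calls; the rule source and file paths here are illustrative, not from the repo:

```python
# Sketch of the yara-python API that YaraAnalyzer wraps; rule and paths are illustrative.
import yara

# Compile a rule that relies on an external variable, then round-trip it as binary.
rules = yara.compile(
    source='rule extension_check { condition: extension == "exe" }',
    externals={'extension': ''}  # Externals must be declared at compile time.
)
rules.save('compiled_yara_rules.bin')

loaded = yara.load('compiled_yara_rules.bin')
matches = loaded.match('/tmp/suspicious_file', externals={'extension': 'exe'})
for match in matches:
    print('{}:{}'.format(match.namespace, match.rule))  # e.g. "default:extension_check"
```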
2 changes: 1 addition & 1 deletion lambda_functions/build.py
@@ -7,7 +7,7 @@

import pip

from lambda_functions.analyzer.main import COMPILED_RULES_FILENAME
from lambda_functions.analyzer.common import COMPILED_RULES_FILENAME
from rules.compile_rules import compile_rules

LAMBDA_DIR = os.path.dirname(os.path.realpath(__file__))
11 changes: 5 additions & 6 deletions lambda_functions/downloader/main.py
@@ -56,19 +56,18 @@ def _download_from_carbon_black(binary: Binary) -> str:
@backoff.on_exception(backoff.expo, (ObjectNotFoundError, zipfile.BadZipFile), max_tries=8,
jitter=backoff.full_jitter)
def _build_metadata(binary: Binary) -> Dict[str, str]:
"""Return basic CarbonBlack metadata to make it easier to triage YARA match alerts."""
"""Return basic metadata to make it easier to triage YARA match alerts."""
LOGGER.info('Retrieving binary metadata')
return {
'carbon_black_group': ','.join(binary.group),
'carbon_black_host_count': str(binary.host_count),
'carbon_black_last_seen': binary.last_seen,
'carbon_black_md5': binary.md5,
'carbon_black_observed_filename': (
'carbon_black_os_type': binary.os_type,
'carbon_black_virustotal_score': str(binary.virustotal.score),
'filepath': (
# Throw out any non-ascii characters (S3 metadata must be ascii).
binary.observed_filenames[0].encode('ascii', 'ignore').decode('ascii')
),
'carbon_black_os_type': binary.os_type,
'carbon_black_virustotal_score': str(binary.virustotal.score)
)
}


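The `encode('ascii', 'ignore').decode('ascii')` round-trip exists because S3 user metadata values must be ASCII; characters outside that range are silently dropped rather than rejected at upload time. A quick illustration with a made-up filename:

```python
# Illustration of the ascii-sanitizing round-trip applied to observed filenames.
observed_filename = 'C:\\Users\\toño\\☆payload.exe'  # Made-up path with non-ascii chars.
sanitized = observed_filename.encode('ascii', 'ignore').decode('ascii')
print(sanitized)  # C:\Users\too\payload.exe (the non-ascii characters are dropped)
```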