217 changes: 113 additions & 104 deletions lambda_functions/analyzer/analyzer_aws_lib.py

Large diffs are not rendered by default.
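The largest file in this PR is collapsed, but its interface can be inferred from the call sites in `binary_info.py` and `main.py`: `download_from_s3` now returns a `(last_modified, metadata)` pair, while `DynamoMatchTable` and `publish_alert_to_sns` handle persistence and alerting. A hedged sketch of that inferred interface follows; it is reconstructed from usage, not from the actual diff:

```python
"""Interface sketch for analyzer_aws_lib, inferred from call sites; not the real diff."""
from typing import Dict, Tuple

import boto3


def download_from_s3(bucket_name: str, object_key: str,
                     download_path: str) -> Tuple[str, Dict[str, str]]:
    """Download an S3 object; return its LastModified timestamp and user-defined metadata."""
    s3_object = boto3.resource('s3').Object(bucket_name, object_key)
    s3_object.download_file(download_path)
    return str(s3_object.last_modified), s3_object.metadata


def publish_alert_to_sns(binary, topic_arn: str) -> None:
    """Publish a YARA match alert for the given BinaryInfo to an SNS topic."""
    boto3.client('sns').publish(
        TopicArn=topic_arn,
        Subject='BinaryAlert: YARA match',  # Assumed subject line, not from the diff.
        Message=str(binary.summary())
    )


class DynamoMatchTable:
    """Stores YARA match results; the real implementation is in the unrendered diff."""

    def __init__(self, table_name: str) -> None:
        self._table = boto3.resource('dynamodb').Table(table_name)

    def save_matches(self, binary, analyzer_version: int) -> bool:
        """Save match results and return True if an SNS alert should be published."""
        raise NotImplementedError  # The actual logic is not shown in this PR view.
```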

82 changes: 45 additions & 37 deletions lambda_functions/analyzer/binary_info.py
@@ -1,30 +1,32 @@
"""Keeps track of all information associated with and computed about a binary."""
import logging
import os
import tempfile
import time
from typing import Any, Dict, Set
import uuid

if __package__:
# Imported by unit tests or other external code.
from lambda_functions.analyzer import analyzer_aws_lib, file_hash
from lambda_functions.analyzer.common import LOGGER
from lambda_functions.analyzer.yara_analyzer import YaraAnalyzer
else:
import analyzer_aws_lib
from common import LOGGER
import file_hash

LOGGER = logging.getLogger()
from yara_analyzer import YaraAnalyzer


class BinaryInfo(object):
"""Organizes the analysis of a single binary blob in S3."""

def __init__(self, bucket_name, object_key, yara_analyzer):
def __init__(self, bucket_name: str, object_key: str, yara_analyzer: YaraAnalyzer):
"""Create a new BinaryInfo.

Args:
bucket_name: [string] S3 bucket name.
object_key: [string] S3 object key.
yara_analyzer: [YaraAnalyzer] built from a compiled rules file.
bucket_name: S3 bucket name.
object_key: S3 object key.
yara_analyzer: Analyzer built from a compiled rules file.
"""
self.bucket_name = bucket_name
self.object_key = object_key
@@ -36,27 +38,34 @@ def __init__(self, bucket_name, object_key, yara_analyzer):

# Computed after file download and analysis.
self.download_time_ms = 0
self.reported_md5 = self.observed_path = ''
self.computed_sha = self.computed_md5 = None
self.s3_last_modified = ''
self.s3_metadata = {}
self.computed_md5 = None
self.computed_sha = None
self.yara_matches = [] # List of yara.Match objects.

@property
def matched_rule_ids(self):
"""A list of 'yara_file:rule_name' for each YARA match."""
return ['{}:{}'.format(match.namespace, match.rule) for match in self.yara_matches]

def __str__(self):
"""Use the S3 identifier as the string representation of the binary."""
return self.s3_identifier

def _download_from_s3(self):
"""Download binary from S3 and measure elapsed time."""
LOGGER.debug('Downloading %s to %s', self.object_key, self.download_path)

start_time = time.time()
self.s3_last_modified, self.s3_metadata = analyzer_aws_lib.download_from_s3(
self.bucket_name, self.object_key, self.download_path)
self.download_time_ms = (time.time() - start_time) * 1000

def __enter__(self):
"""Download the binary from S3 and run YARA analysis."""
self._download_from_s3()
self.computed_sha, self.computed_md5 = file_hash.compute_hashes(self.download_path)

LOGGER.debug('Running YARA analysis')
self.yara_matches = self.yara_analyzer.analyze(
self.download_path, original_target_path=self.observed_path)
self.download_path, original_target_path=self.filepath
)

return self

@@ -69,46 +78,45 @@ def __exit__(self, exception_type, exception_value, traceback):
file.truncate()
os.remove(self.download_path)

def _download_from_s3(self):
"""Download binary from S3 and measure elapsed time."""
LOGGER.debug('Downloading %s to %s', self.object_key, self.download_path)

start_time = time.time()
s3_metadata = analyzer_aws_lib.download_from_s3(
self.bucket_name, self.object_key, self.download_path)
self.download_time_ms = (time.time() - start_time) * 1000
@property
def matched_rule_ids(self) -> Set[str]:
"""A list of 'yara_file:rule_name' for each YARA match."""
return set('{}:{}'.format(match.namespace, match.rule) for match in self.yara_matches)

self.reported_md5 = s3_metadata.get('reported_md5', '')
self.observed_path = s3_metadata.get('observed_path', '')
@property
def filepath(self) -> str:
"""The filepath from the S3 metadata, if present."""
return self.s3_metadata.get('filepath', '')

def save_matches_and_alert(self, lambda_version, dynamo_table_name, sns_topic_arn):
def save_matches_and_alert(
self, analyzer_version: int, dynamo_table_name: str, sns_topic_arn: str) -> None:
"""Save match results to Dynamo and publish an alert to SNS if appropriate.

Args:
lambda_version: [int] The currently executing version of the Lambda function.
dynamo_table_name: [string] Save YARA match results to this Dynamo table.
sns_topic_arn: [string] Publish match alerts to this SNS topic ARN.
analyzer_version: The currently executing version of the Lambda function.
dynamo_table_name: Save YARA match results to this Dynamo table.
sns_topic_arn: Publish match alerts to this SNS topic ARN.
"""
table = analyzer_aws_lib.DynamoMatchTable(dynamo_table_name)
needs_alert = table.save_matches(self, lambda_version)
needs_alert = table.save_matches(self, analyzer_version)

# Send alert if appropriate.
if needs_alert:
LOGGER.info('Publishing an SNS alert')
analyzer_aws_lib.publish_alert_to_sns(self, sns_topic_arn)

def summary(self):
def summary(self) -> Dict[str, Any]:
"""Generate a summary dictionary of binary attributes."""
result = {
'FileInfo': {
'ComputedMD5': self.computed_md5,
'ComputedSHA256': self.computed_sha,
'ReportedMD5': self.reported_md5,
'MD5': self.computed_md5,
'S3LastModified': self.s3_last_modified,
'S3Location': self.s3_identifier,
'SamplePath': self.observed_path
'S3Metadata': self.s3_metadata,
'SHA256': self.computed_sha
},
'NumMatchedRules': len(self.yara_matches),
'MatchedRules': {}
'MatchedRules': {},
'NumMatchedRules': len(self.yara_matches)
}

for index, match in enumerate(self.yara_matches, start=1):
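With `__enter__` and `__exit__` both defined, the class reads as a context manager: entry downloads, hashes, and scans the binary; exit shreds the temporary file even if analysis raised. A minimal usage sketch, where the bucket, key, table, and topic values are all placeholders:

```python
# Usage sketch for BinaryInfo; all resource names below are placeholders.
from lambda_functions.analyzer import binary_info, yara_analyzer
from lambda_functions.analyzer.common import COMPILED_RULES_FILEPATH

analyzer = yara_analyzer.YaraAnalyzer(COMPILED_RULES_FILEPATH)

with binary_info.BinaryInfo('my-bucket', 'space/invaders.exe', analyzer) as binary:
    # On entry, the binary has already been downloaded, hashed, and scanned.
    if binary.yara_matches:
        binary.save_matches_and_alert(
            1, 'my-dynamo-table', 'arn:aws:sns:us-east-1:123456789012:my-topic')
    print(binary.summary())
# On exit, the temporary download was zeroed out and deleted.
```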
11 changes: 11 additions & 0 deletions lambda_functions/analyzer/common.py
@@ -0,0 +1,11 @@
"""Common resources shared among the analyzer components."""
import logging
import os

LOGGER = logging.getLogger()
LOGGER.setLevel(logging.INFO)

# Define the name and location of the compiled YARA rules file.
COMPILED_RULES_FILENAME = 'compiled_yara_rules.bin'
THIS_DIRECTORY = os.path.dirname(os.path.realpath(__file__)) # Directory containing this file.
COMPILED_RULES_FILEPATH = os.path.join(THIS_DIRECTORY, COMPILED_RULES_FILENAME)
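Both `main.py` and `binary_info.py` previously carried their own copies of these definitions; after this change they import the shared names instead, e.g.:

```python
# Sketch: how the other analyzer modules consume the shared definitions.
from lambda_functions.analyzer.common import COMPILED_RULES_FILEPATH, LOGGER

LOGGER.info('Compiled rules expected at %s', COMPILED_RULES_FILEPATH)
```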
14 changes: 8 additions & 6 deletions lambda_functions/analyzer/file_hash.py
@@ -1,18 +1,20 @@
"""Memory-efficient file hashing."""
import hashlib
import io
from typing import Generator, Tuple

MB = 2 ** 20 # ~ 1 million bytes


def _read_in_chunks(file_object, chunk_size=2*MB):
def _read_in_chunks(file_object: io.FileIO, chunk_size: int = 2*MB) -> Generator[bytes, None, None]:
"""Read a file in fixed-size chunks (to minimize memory usage for large files).

Args:
file_object: An opened file-like object supporting read().
chunk_size: [int] Max size (in bytes) of each file chunk.
chunk_size: Max size (in bytes) of each file chunk.

Yields:
[string] file chunks, each of size at most chunk_size.
File chunks, each of size at most chunk_size.
"""
while True:
chunk = file_object.read(chunk_size)
@@ -22,16 +24,16 @@ def _read_in_chunks(file_object, chunk_size=2*MB):
return # End of file.


def compute_hashes(file_path):
def compute_hashes(file_path: str) -> Tuple[str, str]:
"""Compute SHA and MD5 hashes for the specified file object.

The MD5 is only included to be compatible with other security tools.

Args:
file_path: [string] File path to be analyzed.
file_path: File path to be analyzed.

Returns:
String tuple (sha_hash, md5_hash).
SHA256 hash, MD5 hash.
"""
sha = hashlib.sha256()
md5 = hashlib.md5()
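The rest of the function is collapsed, but the chunked pattern is clear from what is shown: feed each fixed-size chunk to both digests so large files never load fully into memory. A self-contained sketch of the same technique, assuming the file is opened in binary mode:

```python
# Self-contained sketch of the chunked hashing technique used by compute_hashes.
import hashlib
from typing import Generator, Tuple

MB = 2 ** 20  # ~ 1 million bytes


def _read_in_chunks(file_object, chunk_size: int = 2 * MB) -> Generator[bytes, None, None]:
    """Yield fixed-size chunks so large files are never fully loaded into memory."""
    while True:
        chunk = file_object.read(chunk_size)
        if not chunk:
            return  # End of file.
        yield chunk


def compute_hashes(file_path: str) -> Tuple[str, str]:
    """Return (sha256_hexdigest, md5_hexdigest) for the file at file_path."""
    sha = hashlib.sha256()
    md5 = hashlib.md5()
    with open(file_path, 'rb') as file_object:
        for chunk in _read_in_chunks(file_object):
            sha.update(chunk)  # Each chunk updates both digests incrementally.
            md5.update(chunk)
    return sha.hexdigest(), md5.hexdigest()
```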
43 changes: 16 additions & 27 deletions lambda_functions/analyzer/main.py
@@ -5,42 +5,31 @@
# YARA_MATCHES_DYNAMO_TABLE_NAME: Name of the Dynamo table which stores YARA match results.
# YARA_ALERTS_SNS_TOPIC_ARN: ARN of the SNS topic which should be alerted on a YARA match.
# Expects a binary YARA rules file to be at './compiled_yara_rules.bin'
import logging
import os
from typing import Any, Dict
import urllib

from yara import Error as YaraError
from botocore.exceptions import ClientError as BotoError

if __package__:
# Imported by unit tests or other external code.
from lambda_functions.analyzer import analyzer_aws_lib, binary_info, yara_analyzer
from lambda_functions.analyzer.common import COMPILED_RULES_FILEPATH, LOGGER
else:
import analyzer_aws_lib
import binary_info
from common import COMPILED_RULES_FILEPATH, LOGGER
import yara_analyzer

LOGGER = logging.getLogger()
LOGGER.setLevel(logging.INFO)

THIS_DIRECTORY = os.path.dirname(os.path.realpath(__file__)) # Directory containing this file.
COMPILED_RULES_FILENAME = 'compiled_yara_rules.bin' # Binary YARA rules file.
COMPILED_RULES_FILEPATH = os.path.join(THIS_DIRECTORY, COMPILED_RULES_FILENAME)

# Build the YaraAnalyzer from the compiled rules file at import time (i.e. once per container).
# This saves 50-100+ ms per Lambda invocation, depending on the size of the rules file.
# However, this breaks imports when the compiled rules file doesn't exist (e.g. unit tests).
# Fall back to computing the ANALYZER during the handler if we have to.
try:
ANALYZER = yara_analyzer.YaraAnalyzer(COMPILED_RULES_FILEPATH)
    # Due to a bug in yara-python, num_rules can only be computed once. Thereafter, it will return 0.
# So we have to compute this here since multiple invocations may share the same analyzer.
NUM_YARA_RULES = ANALYZER.num_rules
except YaraError:
ANALYZER = None
ANALYZER = yara_analyzer.YaraAnalyzer(COMPILED_RULES_FILEPATH)
# Due to a bug in yara-python, num_rules can only be computed once. Thereafter, it will return 0.
# So we have to compute this here since multiple invocations may share the same analyzer.
NUM_YARA_RULES = ANALYZER.num_rules


def analyze_lambda_handler(event_data, lambda_context):
def analyze_lambda_handler(event_data: Dict[str, Any], lambda_context) -> Dict[str, Dict[str, Any]]:
"""Lambda function entry point.

Args:
@@ -52,18 +41,18 @@ def analyze_lambda_handler(event_data, lambda_context):
lambda_context: LambdaContext object (with .function_version).

Returns:
A dict mapping S3 object identifier [string] to a summary [dict] of file info and matched
YARA rule information.
A dict mapping S3 object identifier to a summary of file info and matched YARA rules.
Example: {
'S3:bucket:key': {
'FileInfo': { ... },
'MatchedRules': { ... },
'NumMatchedRules': 1
}
}
"""
result = {}
binaries = [] # List of the BinaryInfo data.

# Build the YaraAnalyzer now if we could not do it when this file was imported.
global ANALYZER, NUM_YARA_RULES # pylint: disable=global-statement
if not ANALYZER:
ANALYZER = yara_analyzer.YaraAnalyzer(COMPILED_RULES_FILEPATH)
NUM_YARA_RULES = ANALYZER.num_rules

# The Lambda version must be an integer.
try:
lambda_version = int(lambda_context.function_version)
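The deleted block was an instance of a common Lambda idiom: perform expensive setup at import time so warm containers reuse it, and fall back to lazy initialization inside the handler if import-time setup failed. A generic sketch of that idiom, with a stand-in for the YARA rule loading:

```python
# Generic sketch of import-time initialization with a lazy handler fallback.
# build_analyzer() is a stand-in for yara.load() on the compiled rules file.
from typing import Any, Optional


def build_analyzer() -> Any:
    """Stand-in for expensive setup, e.g. loading compiled YARA rules."""
    return object()


try:
    ANALYZER: Optional[Any] = build_analyzer()  # Runs once per Lambda container.
except OSError:
    ANALYZER = None  # e.g. unit tests, where the rules artifact may not exist.


def handler(event: Any, context: Any) -> Any:
    global ANALYZER  # pylint: disable=global-statement
    if ANALYZER is None:
        ANALYZER = build_analyzer()  # The fail-safe path this PR removes.
    return ANALYZER
```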
19 changes: 10 additions & 9 deletions lambda_functions/analyzer/yara_analyzer.py
@@ -1,34 +1,35 @@
"""Wrapper around YARA analysis."""
import os
from typing import Dict, List

import yara


class YaraAnalyzer(object):
"""Encapsulates YARA analysis and matching functions."""

def __init__(self, rules_file):
def __init__(self, rules_file: str):
"""Initialize the analyzer with a prebuilt binary YARA rules file.

Args:
rules_file: [string] Path to the binary rules file.
rules_file: Path to the binary rules file.
"""
self._rules = yara.load(rules_file)

@property
def num_rules(self):
def num_rules(self) -> int:
"""Count the number of YARA rules loaded in the analyzer."""
return sum(1 for _ in self._rules)

@staticmethod
def _yara_variables(original_target_path):
def _yara_variables(original_target_path: str) -> Dict[str, str]:
"""Compute external variables needed for some YARA rules.

Args:
original_target_path: [string] Path where the binary was originally discovered.
original_target_path: Path where the binary was originally discovered.

Returns:
A dictionary mapping string variable names to string values.
A map from YARA variable names to their computed values.
"""
file_name = os.path.basename(original_target_path)
file_suffix = file_name.split('.')[-1] if '.' in file_name else '' # e.g. "exe" or "rar".
@@ -39,12 +40,12 @@ def _yara_variables(original_target_path):
'filetype': file_suffix.upper() # Used in only one rule (checking for "GIF").
}

def analyze(self, target_file, original_target_path=''):
def analyze(self, target_file: str, original_target_path: str = '') -> List:
"""Run YARA analysis on a file.

Args:
target_file: [string] Local path to target file to be analyzed.
original_target_path: [string] Path where the target file was originally discovered.
target_file: Local path to target file to be analyzed.
original_target_path: Path where the target file was originally discovered.

Returns:
List of yara.Match objects.
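The elided body of `analyze` presumably forwards these variables to `self._rules.match`. A self-contained sketch of the underlying yara-python calls; the rule source and file paths here are illustrative, not from the repo:

```python
# Sketch of the yara-python API that YaraAnalyzer wraps; rule and paths are illustrative.
import yara

# Compile a rule that relies on an external variable, then round-trip it as binary.
rules = yara.compile(
    source='rule extension_check { condition: extension == "exe" }',
    externals={'extension': ''}  # Externals must be declared at compile time.
)
rules.save('compiled_yara_rules.bin')

loaded = yara.load('compiled_yara_rules.bin')
matches = loaded.match('/tmp/suspicious_file', externals={'extension': 'exe'})
for match in matches:
    print('{}:{}'.format(match.namespace, match.rule))  # e.g. "default:extension_check"
```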
2 changes: 1 addition & 1 deletion lambda_functions/build.py
@@ -7,7 +7,7 @@

import pip

from lambda_functions.analyzer.main import COMPILED_RULES_FILENAME
from lambda_functions.analyzer.common import COMPILED_RULES_FILENAME
from rules.compile_rules import compile_rules

LAMBDA_DIR = os.path.dirname(os.path.realpath(__file__))
11 changes: 5 additions & 6 deletions lambda_functions/downloader/main.py
@@ -56,19 +56,18 @@ def _download_from_carbon_black(binary: Binary) -> str:
@backoff.on_exception(backoff.expo, (ObjectNotFoundError, zipfile.BadZipFile), max_tries=8,
jitter=backoff.full_jitter)
def _build_metadata(binary: Binary) -> Dict[str, str]:
"""Return basic CarbonBlack metadata to make it easier to triage YARA match alerts."""
"""Return basic metadata to make it easier to triage YARA match alerts."""
LOGGER.info('Retrieving binary metadata')
return {
'carbon_black_group': ','.join(binary.group),
'carbon_black_host_count': str(binary.host_count),
'carbon_black_last_seen': binary.last_seen,
'carbon_black_md5': binary.md5,
'carbon_black_observed_filename': (
'carbon_black_os_type': binary.os_type,
'carbon_black_virustotal_score': str(binary.virustotal.score),
'filepath': (
# Throw out any non-ascii characters (S3 metadata must be ascii).
binary.observed_filenames[0].encode('ascii', 'ignore').decode('ascii')
),
'carbon_black_os_type': binary.os_type,
'carbon_black_virustotal_score': str(binary.virustotal.score)
)
}


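The `encode('ascii', 'ignore').decode('ascii')` round-trip exists because S3 user metadata values must be ASCII; characters outside that range are silently dropped rather than rejected at upload time. A quick illustration with a made-up filename:

```python
# Illustration of the ascii-sanitizing round-trip applied to observed filenames.
observed_filename = 'C:\\Users\\toño\\☆payload.exe'  # Made-up path with non-ascii chars.
sanitized = observed_filename.encode('ascii', 'ignore').decode('ascii')
print(sanitized)  # C:\Users\too\payload.exe (the non-ascii characters are dropped)
```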