Commit 54b0589

Cleanup before v1.0 (#58)

austinbyers committed Sep 12, 2017
1 parent 4298b99 commit 54b0589
Showing 36 changed files with 734 additions and 410 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -20,6 +20,9 @@ compiled_yara_rules.bin
# Coverage
.coverage

+# Mypy
+.mypy_cache/
+
# MacOS artifacts
Thumbs.db
.DS_Store
Expand Down
1 change: 1 addition & 0 deletions .travis.yml
@@ -9,5 +9,6 @@ script:
- coverage run manage.py unit_test
- coverage report # Required coverage threshold specified in .coveragerc
- find . -name '*.py' -exec pylint '{}' +
+- mypy . --ignore-missing-imports
after_success:
- coveralls
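The new mypy CI step type-checks the whole tree, while --ignore-missing-imports suppresses errors for third-party packages (such as boto3 and yara) that ship without type stubs. It catches exactly the class of inference bug fixed later in this commit, where an empty {} literal creates a dict even though a set was intended. A minimal illustration of that check, as a hypothetical snippet rather than repo code:

from typing import Set

objects = {}             # mypy flags this line: {} is an empty dict literal, never a set
fixed: Set[str] = set()  # the commit's fix: explicit annotation plus the set() constructor
fixed.add('binary-key')  # OK: mypy now knows fixed is a Set[str]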
14 changes: 7 additions & 7 deletions lambda_functions/analyzer/analyzer_aws_lib.py
@@ -1,6 +1,6 @@
"""Collection of boto3 calls to AWS resources for the analyzer function."""
import json
-from typing import Dict, List, Set, Tuple, Union
+from typing import Dict, List, Optional, Set, Tuple, Union

import boto3
from boto3.dynamodb.conditions import Key
@@ -9,8 +9,9 @@
from lambda_functions.analyzer.binary_info import BinaryInfo
from lambda_functions.analyzer.common import LOGGER
else:
-from binary_info import BinaryInfo
-from common import LOGGER
+# mypy complains about duplicate definitions
+from binary_info import BinaryInfo # type: ignore
+from common import LOGGER # type: ignore

SNS_PUBLISH_SUBJECT_MAX_SIZE = 99

@@ -153,16 +154,15 @@ class DynamoMatchTable(object):
S3Objects (Set[str]): A set of S3 keys containing the corresponding binary.
Duplicate uploads (multiple binaries with the same SHA) are allowed.
"""
-def __init__(self, table_name: str):
+def __init__(self, table_name: str) -> None:
"""Establish connection to Dynamo.
Args:
table_name: The name of the Dynamo table containing match information.
"""
self._table = DYNAMODB.Table(table_name)

-def _most_recent_item(self, sha: str) -> Union[
-Tuple[int, Set[str], Set[str], Set[str]], None]:
+def _most_recent_item(self, sha: str) -> Optional[Tuple[int, Set[str], Set[str], Set[str]]]:
"""Query the table for the most recent entry with the given SHA.
Args:
@@ -188,7 +188,7 @@ def _most_recent_item(self, sha: str) -> Union[
# When re-analyzing all binaries, only one S3 object will be added to the DB at a time.
# In order to prevent spurious alerts about new S3 objects, we report S3 objects from
# the previous Lambda version as well.
-previous_s3_objects = {}
+previous_s3_objects: Set[str] = set()
if len(most_recent_items) >= 2:
previous_s3_objects = set(most_recent_items[1]['S3Objects'])
return analyzer_version, matched_rules, s3_objects, previous_s3_objects
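Two patterns in this file recur throughout the commit. Optional[X] is shorthand for Union[X, None], so the _most_recent_item rewrite changes readability, not behavior: callers must still check for None before unpacking the tuple. The # type: ignore comments exist because each module is imported under two names (packaged vs. flat Lambda layout), which mypy reports as duplicate definitions. A hedged sketch of the caller side, using a hypothetical alias for the return tuple:

from typing import Optional, Set, Tuple

# Hypothetical alias for _most_recent_item's four-element return tuple.
MatchRecord = Tuple[int, Set[str], Set[str], Set[str]]

def report(item: Optional[MatchRecord]) -> None:
    if item is None:  # mypy narrows Optional[MatchRecord] to None on this branch
        print('No previous Dynamo entry for this SHA')
        return
    analyzer_version, matched_rules, s3_objects, previous = item  # safe after the None check
    print(analyzer_version, sorted(matched_rules))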
54 changes: 28 additions & 26 deletions lambda_functions/analyzer/binary_info.py
@@ -2,7 +2,7 @@
import os
import tempfile
import time
-from typing import Any, Dict, Set
+from typing import Any, Dict, List, Set
import uuid

if __package__:
@@ -11,16 +11,17 @@
from lambda_functions.analyzer.common import LOGGER
from lambda_functions.analyzer.yara_analyzer import YaraAnalyzer
else:
-import analyzer_aws_lib
-from common import LOGGER
-import file_hash
-from yara_analyzer import YaraAnalyzer
+# mypy complains about duplicate definitions
+import analyzer_aws_lib # type: ignore
+from common import LOGGER # type: ignore
+import file_hash # type: ignore
+from yara_analyzer import YaraAnalyzer # type: ignore


class BinaryInfo(object):
"""Organizes the analysis of a single binary blob in S3."""

-def __init__(self, bucket_name: str, object_key: str, yara_analyzer: YaraAnalyzer):
+def __init__(self, bucket_name: str, object_key: str, yara_analyzer: YaraAnalyzer) -> None:
"""Create a new BinaryInfo.
Args:
@@ -37,18 +38,18 @@ def __init__(self, bucket_name: str, object_key: str, yara_analyzer: YaraAnalyze
self.yara_analyzer = yara_analyzer

# Computed after file download and analysis.
-self.download_time_ms = 0
+self.download_time_ms = 0.0
self.s3_last_modified = ''
-self.s3_metadata = {}
-self.computed_md5 = None
-self.computed_sha = None
-self.yara_matches = [] # List of yara.Match objects.
+self.s3_metadata: Dict[str, str] = dict()
+self.computed_md5 = ''
+self.computed_sha = ''
+self.yara_matches: List[Any] = list() # List of yara.Match objects (not an importable type)

-def __str__(self):
+def __str__(self) -> str:
"""Use the S3 identifier as the string representation of the binary."""
return self.s3_identifier

-def _download_from_s3(self):
+def _download_from_s3(self) -> None:
"""Download binary from S3 and measure elapsed time."""
LOGGER.debug('Downloading %s to %s', self.object_key, self.download_path)

@@ -107,25 +108,26 @@ def save_matches_and_alert(

def summary(self) -> Dict[str, Any]:
"""Generate a summary dictionary of binary attributes."""
-result = {
+matched_rules = {
+'Rule{}'.format(index): {
+# YARA string IDs, e.g. "$string1"
+'MatchedStrings': list(sorted(set(t[1] for t in match.strings))),
+'Meta': match.meta,
+'RuleFile': match.namespace,
+'RuleName': match.rule,
+'RuleTags': match.tags
+}
+for index, match in enumerate(self.yara_matches, start=1)
+}

+return {
'FileInfo': {
'MD5': self.computed_md5,
'S3LastModified': self.s3_last_modified,
'S3Location': self.s3_identifier,
'S3Metadata': self.s3_metadata,
'SHA256': self.computed_sha
},
-'MatchedRules': {},
+'MatchedRules': matched_rules,
'NumMatchedRules': len(self.yara_matches)
}

-for index, match in enumerate(self.yara_matches, start=1):
-result['MatchedRules']['Rule{}'.format(index)] = {
-# YARA string IDs, e.g. "$string1"
-'MatchedStrings': list(sorted(set(t[1] for t in match.strings))),
-'Meta': match.meta,
-'RuleFile': match.namespace,
-'RuleName': match.rule,
-'RuleTags': match.tags
-}
-return result
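The summary() refactor swaps build-then-mutate (filling result['MatchedRules'] in a loop after constructing the dict) for a dict comprehension computed up front, so every key is assigned exactly once and the function ends in a single return expression. The returned structure looks roughly like this; all values below are illustrative, not taken from the repo:

example_summary = {  # hypothetical output for a binary with a single YARA match
    'FileInfo': {
        'MD5': '9e107d9d372bb6826bd81d3542a419d6',          # illustrative digest
        'S3LastModified': 'Tue, 12 Sep 2017 00:00:00 GMT',  # illustrative timestamp
        'S3Location': 'my-bucket/path/to/binary',           # illustrative identifier
        'S3Metadata': {'filepath': 'original-name.exe'},
        'SHA256': 'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855'
    },
    'MatchedRules': {
        'Rule1': {
            'MatchedStrings': ['$string1'],
            'Meta': {'description': 'Example rule'},
            'RuleFile': 'example.yara',
            'RuleName': 'example_rule',
            'RuleTags': ['suspicious']
        }
    },
    'NumMatchedRules': 1
}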
5 changes: 2 additions & 3 deletions lambda_functions/analyzer/file_hash.py
@@ -1,12 +1,11 @@
"""Memory-efficient file hashing."""
import hashlib
-import io
-from typing import Tuple
+from typing import Generator, IO, Tuple

MB = 2 ** 20 # ~ 1 million bytes


-def _read_in_chunks(file_object: io.FileIO, chunk_size: int = 2*MB) -> str:
+def _read_in_chunks(file_object: IO[bytes], chunk_size: int = 2*MB) -> Generator[bytes, None, None]:
"""Read a file in fixed-size chunks (to minimize memory usage for large files).
Args:
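The corrected signature says what the function actually does: it accepts any binary file object (IO[bytes] rather than the narrower io.FileIO, which is why import io can be dropped) and yields bytes chunks rather than returning a str. A minimal sketch of how such a generator pairs with hashlib for constant-memory hashing; compute_hashes is an assumed helper name for illustration, not necessarily this module's API:

import hashlib
from typing import Generator, IO, Tuple

MB = 2 ** 20

def _read_in_chunks(file_object: IO[bytes], chunk_size: int = 2 * MB) -> Generator[bytes, None, None]:
    """Yield fixed-size chunks until the file is exhausted."""
    chunk = file_object.read(chunk_size)
    while chunk:
        yield chunk
        chunk = file_object.read(chunk_size)

def compute_hashes(file_path: str) -> Tuple[str, str]:
    """Hypothetical helper: read the file once, feeding both digests chunk by chunk."""
    md5, sha = hashlib.md5(), hashlib.sha256()
    with open(file_path, 'rb') as file_object:
        for chunk in _read_in_chunks(file_object):
            md5.update(chunk)
            sha.update(chunk)
    return md5.hexdigest(), sha.hexdigest()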
11 changes: 6 additions & 5 deletions lambda_functions/analyzer/main.py
@@ -7,7 +7,7 @@
# Expects a binary YARA rules file to be at './compiled_yara_rules.bin'
import os
from typing import Any, Dict
-import urllib
+import urllib.parse

from botocore.exceptions import ClientError as BotoError

@@ -16,10 +16,11 @@
from lambda_functions.analyzer import analyzer_aws_lib, binary_info, yara_analyzer
from lambda_functions.analyzer.common import COMPILED_RULES_FILEPATH, LOGGER
else:
-import analyzer_aws_lib
-import binary_info
-from common import COMPILED_RULES_FILEPATH, LOGGER
-import yara_analyzer
+# mypy complains about duplicate definitions
+import analyzer_aws_lib # type: ignore
+import binary_info # type: ignore
+from common import COMPILED_RULES_FILEPATH, LOGGER # type: ignore
+import yara_analyzer # type: ignore

# Build the YaraAnalyzer from the compiled rules file at import time (i.e. once per container).
# This saves 50-100+ ms per Lambda invocation, depending on the size of the rules file.
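Module-level initialization exploits Lambda container reuse: code at import scope runs once per container (the cold start), and warm invocations reuse the objects already built. A self-contained sketch of the pattern, with a stand-in for the YaraAnalyzer this module constructs:

import time

# Import scope: runs once per Lambda container. Stand-in for an expensive build
# such as YaraAnalyzer(COMPILED_RULES_FILEPATH).
EXPENSIVE_RESOURCE = {'loaded_at': time.time()}

def lambda_handler(event: dict, context: object) -> float:
    """Invocation scope (hypothetical handler): warm calls see the same preloaded resource."""
    return EXPENSIVE_RESOURCE['loaded_at']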
2 changes: 1 addition & 1 deletion lambda_functions/analyzer/yara_analyzer.py
@@ -8,7 +8,7 @@
class YaraAnalyzer(object):
"""Encapsulates YARA analysis and matching functions."""

-def __init__(self, rules_file: str):
+def __init__(self, rules_file: str) -> None:
"""Initialize the analyzer with a prebuilt binary YARA rules file.
Args:
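A closing note on the -> None annotations added across this commit: mypy treats a missing return annotation as an implicit Any, and strict options such as --disallow-untyped-defs accept only fully annotated functions. Since __init__ can only ever return None, spelling it out costs nothing. Illustration (hypothetical class):

class Analyzer(object):
    def __init__(self, rules_file: str) -> None:  # fully annotated; return is None, not implicit Any
        self.rules_file = rules_file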